whisper.rn 0.5.0-rc.9 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/cpp/ggml-alloc.c +1 -15
  2. package/cpp/ggml-backend-reg.cpp +17 -8
  3. package/cpp/ggml-backend.cpp +15 -22
  4. package/cpp/ggml-common.h +17 -0
  5. package/cpp/ggml-cpu/arch/arm/quants.c +132 -596
  6. package/cpp/ggml-cpu/arch/arm/repack.cpp +14 -286
  7. package/cpp/ggml-cpu/arch/x86/quants.c +184 -675
  8. package/cpp/ggml-cpu/arch/x86/repack.cpp +4679 -1657
  9. package/cpp/ggml-cpu/arch-fallback.h +34 -0
  10. package/cpp/ggml-cpu/ggml-cpu.c +22 -1
  11. package/cpp/ggml-cpu/ggml-cpu.cpp +21 -24
  12. package/cpp/ggml-cpu/ops.cpp +870 -211
  13. package/cpp/ggml-cpu/ops.h +3 -8
  14. package/cpp/ggml-cpu/quants.c +35 -0
  15. package/cpp/ggml-cpu/quants.h +8 -0
  16. package/cpp/ggml-cpu/repack.cpp +458 -47
  17. package/cpp/ggml-cpu/repack.h +22 -0
  18. package/cpp/ggml-cpu/simd-mappings.h +1 -1
  19. package/cpp/ggml-cpu/traits.cpp +2 -2
  20. package/cpp/ggml-cpu/traits.h +1 -1
  21. package/cpp/ggml-cpu/vec.cpp +12 -9
  22. package/cpp/ggml-cpu/vec.h +107 -13
  23. package/cpp/ggml-impl.h +77 -0
  24. package/cpp/ggml-metal-impl.h +51 -12
  25. package/cpp/ggml-metal.m +610 -115
  26. package/cpp/ggml-opt.cpp +97 -41
  27. package/cpp/ggml-opt.h +25 -6
  28. package/cpp/ggml-quants.c +110 -16
  29. package/cpp/ggml-quants.h +6 -0
  30. package/cpp/ggml-whisper-sim.metallib +0 -0
  31. package/cpp/ggml-whisper.metallib +0 -0
  32. package/cpp/ggml.c +314 -88
  33. package/cpp/ggml.h +137 -11
  34. package/cpp/gguf.cpp +8 -1
  35. package/cpp/jsi/RNWhisperJSI.cpp +23 -6
  36. package/cpp/whisper.cpp +15 -6
  37. package/ios/RNWhisper.mm +6 -6
  38. package/ios/RNWhisperContext.mm +2 -0
  39. package/ios/RNWhisperVadContext.mm +2 -0
  40. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  41. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  42. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  43. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  44. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  45. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  46. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  47. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  48. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  49. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  50. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  51. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  52. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  53. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  54. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  55. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  56. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +17 -0
  57. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  58. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  59. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  60. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  61. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +137 -11
  62. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  63. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  64. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +17 -0
  65. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +77 -0
  66. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +51 -12
  67. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +25 -6
  68. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +6 -0
  69. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +137 -11
  70. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  71. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  72. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js +13 -0
  73. package/lib/commonjs/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  74. package/lib/module/realtime-transcription/RealtimeTranscriber.js +13 -0
  75. package/lib/module/realtime-transcription/RealtimeTranscriber.js.map +1 -1
  76. package/lib/typescript/realtime-transcription/RealtimeTranscriber.d.ts.map +1 -1
  77. package/lib/typescript/realtime-transcription/types.d.ts +6 -0
  78. package/lib/typescript/realtime-transcription/types.d.ts.map +1 -1
  79. package/package.json +1 -1
  80. package/src/realtime-transcription/RealtimeTranscriber.ts +17 -0
  81. package/src/realtime-transcription/types.ts +6 -0
@@ -44,7 +44,14 @@ struct block_q4_Kx8 {
44
44
  };
45
45
 
46
46
  static_assert(sizeof(block_q4_Kx8) == sizeof(wsp_ggml_half) * 16 + K_SCALE_SIZE * 8 + QK_K * 4, "wrong q4_K block size/padding");
47
+ struct block_q2_Kx8 {
48
+ wsp_ggml_half d[8]; // super-block scale for quantized scales
49
+ wsp_ggml_half dmin[8]; // super-block scale for quantized mins
50
+ uint8_t scales[128]; // scales and mins, quantized with 4 bits
51
+ uint8_t qs[512]; // 2-bit quants
52
+ };
47
53
 
54
+ static_assert(sizeof(block_q2_Kx8) == sizeof(wsp_ggml_half) * 16 + QK_K/2 + QK_K * 2, "wrong q2_K block size/padding");
48
55
  struct block_q8_Kx4 {
49
56
  float d[4]; // delta
50
57
  int8_t qs[QK_K * 4]; // quants
@@ -60,6 +67,13 @@ struct block_iq4_nlx4 {
60
67
 
61
68
  static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(wsp_ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
62
69
 
70
+ struct block_iq4_nlx8 {
71
+ wsp_ggml_half d[8]; // deltas for 8 iq4_nl blocks
72
+ uint8_t qs[QK4_NL * 4]; // nibbles / quants for 8 iq4_nl blocks
73
+ };
74
+
75
+ static_assert(sizeof(block_iq4_nlx8) == 8 * sizeof(wsp_ggml_half) + QK4_NL * 4, "wrong iq4_nlx8 block size/padding");
76
+
63
77
  #if defined(__cplusplus)
64
78
  extern "C" {
65
79
  #endif
@@ -71,12 +85,16 @@ void wsp_ggml_gemv_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs,
71
85
  void wsp_ggml_gemv_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
72
86
  void wsp_ggml_gemv_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
73
87
  void wsp_ggml_gemv_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
88
+ void wsp_ggml_gemv_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
74
89
  void wsp_ggml_gemv_iq4_nl_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
90
+ void wsp_ggml_gemv_iq4_nl_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
75
91
  void wsp_ggml_gemm_q4_0_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
76
92
  void wsp_ggml_gemm_q4_0_4x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
77
93
  void wsp_ggml_gemm_q4_0_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
78
94
  void wsp_ggml_gemm_q4_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
95
+ void wsp_ggml_gemm_q2_K_8x8_q8_K(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
79
96
  void wsp_ggml_gemm_iq4_nl_4x4_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
97
+ void wsp_ggml_gemm_iq4_nl_8x8_q8_0(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
80
98
 
81
99
  // Native implementations
82
100
  void wsp_ggml_wsp_quantize_mat_q8_0_4x4_generic(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT vy, int64_t k);
@@ -86,12 +104,16 @@ void wsp_ggml_gemv_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, siz
86
104
  void wsp_ggml_gemv_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
87
105
  void wsp_ggml_gemv_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
88
106
  void wsp_ggml_gemv_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
107
+ void wsp_ggml_gemv_q2_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
89
108
  void wsp_ggml_gemv_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
109
+ void wsp_ggml_gemv_iq4_nl_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
90
110
  void wsp_ggml_gemm_q4_0_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
91
111
  void wsp_ggml_gemm_q4_0_4x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
92
112
  void wsp_ggml_gemm_q4_0_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
93
113
  void wsp_ggml_gemm_q4_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
114
+ void wsp_ggml_gemm_q2_K_8x8_q8_K_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
94
115
  void wsp_ggml_gemm_iq4_nl_4x4_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
116
+ void wsp_ggml_gemm_iq4_nl_8x8_q8_0_generic(int n, float * WSP_GGML_RESTRICT s, size_t bs, const void * WSP_GGML_RESTRICT vx, const void * WSP_GGML_RESTRICT vy, int nr, int nc);
95
117
 
96
118
  #if defined(__cplusplus)
97
119
  } // extern "C"
@@ -189,7 +189,7 @@ inline static float wsp_ggml_lookup_fp16_to_fp32(wsp_ggml_fp16_t f) {
189
189
  #define WSP_GGML_F32xt_LOAD(...) WSP_GGML_F32xt_LOAD_IMPL(DEFAULT_PG, __VA_ARGS__)
190
190
  #define WSP_GGML_F32xt_STORE_IMPL(pg,a,b) svst1_f32(pg, a, b)
191
191
  #define WSP_GGML_F32xt_STORE(...) WSP_GGML_F32xt_STORE_IMPL(DEFAULT_PG, __VA_ARGS__)
192
- #define WSP_GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, a, b, c)
192
+ #define WSP_GGML_F32xt_FMA_IMPL(pg, a, b, c) svmad_f32_m(pg, b, c, a)
193
193
  #define WSP_GGML_F32xt_FMA(...) WSP_GGML_F32xt_FMA_IMPL(DEFAULT_PG, __VA_ARGS__)
194
194
  #define WSP_GGML_F32xt_ADD_IMPL(pg, a, b) svadd_f32_m(pg, a, b)
195
195
  #define WSP_GGML_F32xt_ADD(...) WSP_GGML_F32xt_ADD_IMPL(DEFAULT_PG, __VA_ARGS__)
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
10
10
  } // namespace ggml::cpu
11
11
 
12
12
  bool wsp_ggml_cpu_extra_compute_forward(struct wsp_ggml_compute_params * params, struct wsp_ggml_tensor * op) {
13
- for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
13
+ for (auto extra : wsp_ggml_backend_cpu_get_extra_buffer_types()) {
14
14
  if (extra && extra->context) {
15
15
  auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
16
16
  auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -23,7 +23,7 @@ bool wsp_ggml_cpu_extra_compute_forward(struct wsp_ggml_compute_params * params,
23
23
  }
24
24
 
25
25
  bool wsp_ggml_cpu_extra_work_size(int n_threads, const struct wsp_ggml_tensor * op, size_t * size) {
26
- for (auto extra : wsp_ggml_backend_cpu_get_extra_buffers_type()) {
26
+ for (auto extra : wsp_ggml_backend_cpu_get_extra_buffer_types()) {
27
27
  if (extra && extra->context) {
28
28
  auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
29
29
  auto tensor_traits = buf_extra->get_tensor_traits(op);
@@ -33,6 +33,6 @@ class extra_buffer_type {
33
33
  } // namespace ggml::cpu
34
34
 
35
35
  // implemented in ggml-cpu.cpp.
36
- std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buffers_type();
36
+ std::vector<wsp_ggml_backend_buffer_type_t> & wsp_ggml_backend_cpu_get_extra_buffer_types();
37
37
 
38
38
  #endif
@@ -37,35 +37,35 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
37
37
  for (int i = 0; i < np; i += wsp_ggml_f32_step) {
38
38
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
39
39
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
40
- sum1 = WSP_GGML_F32_VEC_FMA(ax1, ay1, sum1);
40
+ sum1 = WSP_GGML_F32_VEC_FMA(sum1, ax1, ay1);
41
41
 
42
42
  ax2 = WSP_GGML_F32_VEC_LOAD(x + i + 1*wsp_ggml_f32_epr);
43
43
  ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
44
- sum2 = WSP_GGML_F32_VEC_FMA(ax2, ay2, sum2);
44
+ sum2 = WSP_GGML_F32_VEC_FMA(sum2, ax2, ay2);
45
45
 
46
46
  ax3 = WSP_GGML_F32_VEC_LOAD(x + i + 2*wsp_ggml_f32_epr);
47
47
  ay3 = WSP_GGML_F32_VEC_LOAD(y + i + 2*wsp_ggml_f32_epr);
48
- sum3 = WSP_GGML_F32_VEC_FMA(ax3, ay3, sum3);
48
+ sum3 = WSP_GGML_F32_VEC_FMA(sum3, ax3, ay3);
49
49
 
50
50
  ax4 = WSP_GGML_F32_VEC_LOAD(x + i + 3*wsp_ggml_f32_epr);
51
51
  ay4 = WSP_GGML_F32_VEC_LOAD(y + i + 3*wsp_ggml_f32_epr);
52
- sum4 = WSP_GGML_F32_VEC_FMA(ax4, ay4, sum4);
52
+ sum4 = WSP_GGML_F32_VEC_FMA(sum4, ax4, ay4);
53
53
 
54
54
  ax5 = WSP_GGML_F32_VEC_LOAD(x + i + 4*wsp_ggml_f32_epr);
55
55
  ay5 = WSP_GGML_F32_VEC_LOAD(y + i + 4*wsp_ggml_f32_epr);
56
- sum5 = WSP_GGML_F32_VEC_FMA(ax5, ay5, sum5);
56
+ sum5 = WSP_GGML_F32_VEC_FMA(sum5, ax5, ay5);
57
57
 
58
58
  ax6 = WSP_GGML_F32_VEC_LOAD(x + i + 5*wsp_ggml_f32_epr);
59
59
  ay6 = WSP_GGML_F32_VEC_LOAD(y + i + 5*wsp_ggml_f32_epr);
60
- sum6 = WSP_GGML_F32_VEC_FMA(ax6, ay6, sum6);
60
+ sum6 = WSP_GGML_F32_VEC_FMA(sum6, ax6, ay6);
61
61
 
62
62
  ax7 = WSP_GGML_F32_VEC_LOAD(x + i + 6*wsp_ggml_f32_epr);
63
63
  ay7 = WSP_GGML_F32_VEC_LOAD(y + i + 6*wsp_ggml_f32_epr);
64
- sum7 = WSP_GGML_F32_VEC_FMA(ax7, ay7, sum7);
64
+ sum7 = WSP_GGML_F32_VEC_FMA(sum7, ax7, ay7);
65
65
 
66
66
  ax8 = WSP_GGML_F32_VEC_LOAD(x + i + 7*wsp_ggml_f32_epr);
67
67
  ay8 = WSP_GGML_F32_VEC_LOAD(y + i + 7*wsp_ggml_f32_epr);
68
- sum8 = WSP_GGML_F32_VEC_FMA(ax8, ay8, sum8);
68
+ sum8 = WSP_GGML_F32_VEC_FMA(sum8, ax8, ay8);
69
69
  }
70
70
  // leftovers
71
71
  // Since 8 unrolls are done in above loop, leftovers lie in range [0, wsp_ggml_f32_step] which is handled in below loop
@@ -73,7 +73,7 @@ void wsp_ggml_vec_dot_f32(int n, float * WSP_GGML_RESTRICT s, size_t bs, const f
73
73
  for (int i = np; i < np2; i += wsp_ggml_f32_epr) {
74
74
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
75
75
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
76
- sum1 = WSP_GGML_F32_VEC_FMA(ax1, ay1, sum1);
76
+ sum1 = WSP_GGML_F32_VEC_FMA(sum1, ax1, ay1);
77
77
  }
78
78
  // maximum number of leftover elements will be less that wsp_ggml_f32_epr. Apply predicated svmad on available elements only
79
79
  if (np2 < n) {
@@ -221,6 +221,9 @@ void wsp_ggml_vec_dot_f16(int n, float * WSP_GGML_RESTRICT s, size_t bs, wsp_ggm
221
221
  for (int i = np; i < n; ++i) {
222
222
  sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
223
223
  }
224
+
225
+ // if you hit this, you are likely running outside the FP range
226
+ assert(!isnan(sumf) && !isinf(sumf));
224
227
  #else
225
228
  for (int i = 0; i < n; ++i) {
226
229
  sumf += (wsp_ggml_float)(WSP_GGML_CPU_FP16_TO_FP32(x[i])*WSP_GGML_CPU_FP16_TO_FP32(y[i]));
@@ -55,7 +55,22 @@ inline static void wsp_ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t
55
55
 
56
56
  inline static void wsp_ggml_vec_set_f16(const int n, wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
57
57
  inline static void wsp_ggml_vec_set_bf16(const int n, wsp_ggml_bf16_t * x, const wsp_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
58
- inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
58
+
59
+ inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) {
60
+ int i = 0;
61
+ #if defined(__AVX2__)
62
+ for (; i + 7 < n; i += 8) {
63
+ __m256 vx = _mm256_loadu_ps(x + i);
64
+ __m256 vy = _mm256_loadu_ps(y + i);
65
+ __m256 vz = _mm256_add_ps(vx, vy);
66
+ _mm256_storeu_ps(z + i, vz);
67
+ }
68
+ #endif
69
+ for (; i < n; ++i) {
70
+ z[i] = x[i] + y[i];
71
+ }
72
+ }
73
+
59
74
  inline static void wsp_ggml_vec_add_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
60
75
  for (int i = 0; i < n; ++i) {
61
76
  z[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(x[i]) + WSP_GGML_CPU_FP16_TO_FP32(y[i]));
@@ -163,49 +178,49 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
163
178
 
164
179
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
165
180
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
166
- ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
181
+ ay1 = WSP_GGML_F32_VEC_FMA(ay1, ax1, vx);
167
182
 
168
183
  WSP_GGML_F32_VEC_STORE(y + i, ay1);
169
184
 
170
185
  ax2 = WSP_GGML_F32_VEC_LOAD(x + i + 1*wsp_ggml_f32_epr);
171
186
  ay2 = WSP_GGML_F32_VEC_LOAD(y + i + 1*wsp_ggml_f32_epr);
172
- ay2 = WSP_GGML_F32_VEC_FMA(ax2, vx, ay2);
187
+ ay2 = WSP_GGML_F32_VEC_FMA(ay2, ax2, vx);
173
188
 
174
189
  WSP_GGML_F32_VEC_STORE(y + i + 1*wsp_ggml_f32_epr, ay2);
175
190
 
176
191
  ax3 = WSP_GGML_F32_VEC_LOAD(x + i + 2*wsp_ggml_f32_epr);
177
192
  ay3 = WSP_GGML_F32_VEC_LOAD(y + i + 2*wsp_ggml_f32_epr);
178
- ay3 = WSP_GGML_F32_VEC_FMA(ax3, vx, ay3);
193
+ ay3 = WSP_GGML_F32_VEC_FMA(ay3, ax3, vx);
179
194
 
180
195
  WSP_GGML_F32_VEC_STORE(y + i + 2*wsp_ggml_f32_epr, ay3);
181
196
 
182
197
  ax4 = WSP_GGML_F32_VEC_LOAD(x + i + 3*wsp_ggml_f32_epr);
183
198
  ay4 = WSP_GGML_F32_VEC_LOAD(y + i + 3*wsp_ggml_f32_epr);
184
- ay4 = WSP_GGML_F32_VEC_FMA(ax4, vx, ay4);
199
+ ay4 = WSP_GGML_F32_VEC_FMA(ay4, ax4, vx);
185
200
 
186
201
  WSP_GGML_F32_VEC_STORE(y + i + 3*wsp_ggml_f32_epr, ay4);
187
202
 
188
203
  ax5 = WSP_GGML_F32_VEC_LOAD(x + i + 4*wsp_ggml_f32_epr);
189
204
  ay5 = WSP_GGML_F32_VEC_LOAD(y + i + 4*wsp_ggml_f32_epr);
190
- ay5 = WSP_GGML_F32_VEC_FMA(ax5, vx, ay5);
205
+ ay5 = WSP_GGML_F32_VEC_FMA(ay5, ax5, vx);
191
206
 
192
207
  WSP_GGML_F32_VEC_STORE(y + i + 4*wsp_ggml_f32_epr, ay5);
193
208
 
194
209
  ax6 = WSP_GGML_F32_VEC_LOAD(x + i + 5*wsp_ggml_f32_epr);
195
210
  ay6 = WSP_GGML_F32_VEC_LOAD(y + i + 5*wsp_ggml_f32_epr);
196
- ay6 = WSP_GGML_F32_VEC_FMA(ax6, vx, ay6);
211
+ ay6 = WSP_GGML_F32_VEC_FMA(ay6, ax6, vx);
197
212
 
198
213
  WSP_GGML_F32_VEC_STORE(y + i + 5*wsp_ggml_f32_epr, ay6);
199
214
 
200
215
  ax7 = WSP_GGML_F32_VEC_LOAD(x + i + 6*wsp_ggml_f32_epr);
201
216
  ay7 = WSP_GGML_F32_VEC_LOAD(y + i + 6*wsp_ggml_f32_epr);
202
- ay7 = WSP_GGML_F32_VEC_FMA(ax7, vx, ay7);
217
+ ay7 = WSP_GGML_F32_VEC_FMA(ay7, ax7, vx);
203
218
 
204
219
  WSP_GGML_F32_VEC_STORE(y + i + 6*wsp_ggml_f32_epr, ay7);
205
220
 
206
221
  ax8 = WSP_GGML_F32_VEC_LOAD(x + i + 7*wsp_ggml_f32_epr);
207
222
  ay8 = WSP_GGML_F32_VEC_LOAD(y + i + 7*wsp_ggml_f32_epr);
208
- ay8 = WSP_GGML_F32_VEC_FMA(ax8, vx, ay8);
223
+ ay8 = WSP_GGML_F32_VEC_FMA(ay8, ax8, vx);
209
224
 
210
225
  WSP_GGML_F32_VEC_STORE(y + i + 7*wsp_ggml_f32_epr, ay8);
211
226
  }
@@ -215,7 +230,7 @@ inline static void wsp_ggml_vec_mad_f32(const int n, float * WSP_GGML_RESTRICT y
215
230
  for (int i = np; i < np2; i += wsp_ggml_f32_epr) {
216
231
  ax1 = WSP_GGML_F32_VEC_LOAD(x + i);
217
232
  ay1 = WSP_GGML_F32_VEC_LOAD(y + i);
218
- ay1 = WSP_GGML_F32_VEC_FMA(ax1, vx, ay1);
233
+ ay1 = WSP_GGML_F32_VEC_FMA(ay1, ax1, vx);
219
234
 
220
235
  WSP_GGML_F32_VEC_STORE(y + i, ay1);
221
236
  }
@@ -351,6 +366,45 @@ inline static void wsp_ggml_vec_mad_f32_unroll(const int n, const int xs, const
351
366
  #endif
352
367
  }
353
368
 
369
+ inline static void wsp_ggml_vec_mad1_f32(const int n, float * y, const float * x, const float s, const float b) {
370
+ #if defined(WSP_GGML_USE_ACCELERATE)
371
+ vDSP_vsmsa(x, 1, &s, &b, y, 1, n);
372
+ #elif defined(WSP_GGML_SIMD)
373
+ #if defined(__ARM_FEATURE_SVE)
374
+ // scalar ; TODO: Write SVE code
375
+ for (int i = 0; i < n; ++i) {
376
+ y[i] = x[i]*s + b;
377
+ }
378
+ #else
379
+ const int np = (n & ~(WSP_GGML_F32_STEP - 1));
380
+
381
+ WSP_GGML_F32_VEC vs = WSP_GGML_F32_VEC_SET1(s);
382
+ WSP_GGML_F32_VEC vb = WSP_GGML_F32_VEC_SET1(b);
383
+
384
+ WSP_GGML_F32_VEC ay[WSP_GGML_F32_ARR];
385
+
386
+ for (int i = 0; i < np; i += WSP_GGML_F32_STEP) {
387
+ for (int j = 0; j < WSP_GGML_F32_ARR; j++) {
388
+ ay[j] = WSP_GGML_F32_VEC_LOAD(x + i + j*WSP_GGML_F32_EPR);
389
+ ay[j] = WSP_GGML_F32_VEC_FMA(ay[j], vs, vb);
390
+
391
+ WSP_GGML_F32_VEC_STORE(y + i + j*WSP_GGML_F32_EPR, ay[j]);
392
+ }
393
+ }
394
+
395
+ // leftovers
396
+ for (int i = np; i < n; ++i) {
397
+ y[i] = x[i]*s + b;
398
+ }
399
+ #endif
400
+ #else
401
+ // scalar
402
+ for (int i = 0; i < n; ++i) {
403
+ y[i] = x[i]*s + b;
404
+ }
405
+ #endif
406
+ }
407
+
354
408
  //inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] *= v; }
355
409
  inline static void wsp_ggml_vec_scale_f32(const int n, float * y, const float v) {
356
410
  #if defined(WSP_GGML_USE_ACCELERATE)
@@ -953,9 +1007,49 @@ void wsp_ggml_vec_swiglu_f32(const int n, float * y, const float * x, const floa
953
1007
 
954
1008
  inline static void wsp_ggml_vec_swiglu_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
955
1009
  for (int i = 0; i < n; ++i) {
956
- float v = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
957
- float w = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
958
- y[i] = WSP_GGML_CPU_FP32_TO_FP16((v/(1.0f + expf(-v))) * w);
1010
+ float xi = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
1011
+ float gi = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1012
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16((xi/(1.0f + expf(-xi))) * gi);
1013
+ }
1014
+ }
1015
+
1016
+ inline static void wsp_ggml_vec_geglu_erf_f32(const int n, float * y, const float * x, const float * g) {
1017
+ for (int i = 0; i < n; ++i) {
1018
+ float xi = x[i];
1019
+ y[i] = 0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * g[i];
1020
+ }
1021
+ }
1022
+
1023
+ inline static void wsp_ggml_vec_geglu_erf_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
1024
+ for (int i = 0; i < n; ++i) {
1025
+ float xi = WSP_GGML_CPU_FP16_TO_FP32(x[i]);
1026
+ float gi = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1027
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(0.5f * xi * (1.0f + erff(xi*SQRT_2_INV)) * gi);
1028
+ }
1029
+ }
1030
+
1031
+ #ifdef WSP_GGML_GELU_QUICK_FP16
1032
+ inline static void wsp_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1033
+ uint16_t t;
1034
+ for (int i = 0; i < n; ++i) {
1035
+ wsp_ggml_fp16_t fp16 = WSP_GGML_CPU_FP32_TO_FP16(x[i]);
1036
+ memcpy(&t, &fp16, sizeof(uint16_t));
1037
+ y[i] = WSP_GGML_CPU_FP16_TO_FP32(wsp_ggml_table_gelu_quick_f16[t]) * g[i];
1038
+ }
1039
+ }
1040
+ #else
1041
+ inline static void wsp_ggml_vec_geglu_quick_f32(const int n, float * y, const float * x, const float * g) {
1042
+ for (int i = 0; i < n; ++i) {
1043
+ y[i] = wsp_ggml_gelu_quick_f32(x[i]) * g[i];
1044
+ }
1045
+ }
1046
+ #endif
1047
+
1048
+ inline static void wsp_ggml_vec_geglu_quick_f16(const int n, wsp_ggml_fp16_t * y, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * g) {
1049
+ const uint16_t * i16 = (const uint16_t *) x;
1050
+ for (int i = 0; i < n; ++i) {
1051
+ float v = WSP_GGML_CPU_FP16_TO_FP32(g[i]);
1052
+ y[i] = WSP_GGML_CPU_FP32_TO_FP16(WSP_GGML_CPU_FP16_TO_FP32(wsp_ggml_table_gelu_quick_f16[i16[i]]) * v);
959
1053
  }
960
1054
  }
961
1055
 
package/cpp/ggml-impl.h CHANGED
@@ -73,6 +73,22 @@ static inline int wsp_ggml_up(int n, int m) {
73
73
  return (n + m - 1) & ~(m - 1);
74
74
  }
75
75
 
76
+ // TODO: move to ggml.h?
77
+ static bool wsp_ggml_are_same_layout(const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b) {
78
+ if (a->type != b->type) {
79
+ return false;
80
+ }
81
+ for (int i = 0; i < WSP_GGML_MAX_DIMS; i++) {
82
+ if (a->ne[i] != b->ne[i]) {
83
+ return false;
84
+ }
85
+ if (a->nb[i] != b->nb[i]) {
86
+ return false;
87
+ }
88
+ }
89
+ return true;
90
+ }
91
+
76
92
  //
77
93
  // logging
78
94
  //
@@ -394,6 +410,67 @@ static inline wsp_ggml_fp16_t wsp_ggml_compute_fp32_to_fp16(float f) {
394
410
  #define WSP_GGML_FP16_TO_FP32(x) WSP_GGML_COMPUTE_FP16_TO_FP32(x)
395
411
  #define WSP_GGML_FP32_TO_FP16(x) WSP_GGML_COMPUTE_FP32_TO_FP16(x)
396
412
 
413
+ static inline float wsp_ggml_e8m0_to_fp32(uint8_t x) {
414
+ uint32_t bits; // Stores the raw bit representation of the float
415
+
416
+ // Handle special case for minimum exponent (denormalized float)
417
+ if (x == 0) {
418
+ // Bit pattern for 2^(-127):
419
+ // - Sign bit: 0 (positive)
420
+ // - Exponent: 0 (denormalized number)
421
+ // - Mantissa: 0x400000 (0.5 in fractional form)
422
+ // Value = 0.5 * 2^(-126) = 2^(-127)
423
+ bits = 0x00400000;
424
+ }
425
+ // note: disabled as we don't need to handle NaNs
426
+ //// Handle special case for NaN (all bits set)
427
+ //else if (x == 0xFF) {
428
+ // // Standard quiet NaN pattern:
429
+ // // - Sign bit: 0
430
+ // // - Exponent: all 1s (0xFF)
431
+ // // - Mantissa: 0x400000 (quiet NaN flag)
432
+ // bits = 0x7FC00000;
433
+ //}
434
+ // Normalized values (most common case)
435
+ else {
436
+ // Construct normalized float by shifting exponent into position:
437
+ // - Exponent field: 8 bits (positions 30-23)
438
+ // - Mantissa: 0 (implicit leading 1)
439
+ // Value = 2^(x - 127)
440
+ bits = (uint32_t) x << 23;
441
+ }
442
+
443
+ float result; // Final float value
444
+ // Safely reinterpret bit pattern as float without type-punning issues
445
+ memcpy(&result, &bits, sizeof(float));
446
+ return result;
447
+ }
448
+
449
+ // Equal to wsp_ggml_e8m0_to_fp32/2
450
+ // Useful with MXFP4 quantization since the E0M2 values are doubled
451
+ static inline float wsp_ggml_e8m0_to_fp32_half(uint8_t x) {
452
+ uint32_t bits;
453
+
454
+ // For x < 2: use precomputed denormal patterns
455
+ if (x < 2) {
456
+ // 0x00200000 = 2^(-128), 0x00400000 = 2^(-127)
457
+ bits = 0x00200000 << x;
458
+ }
459
+ // For x >= 2: normalized exponent adjustment
460
+ else {
461
+ // 0.5 * 2^(x-127) = 2^(x-128) = normalized with exponent (x-1)
462
+ bits = (uint32_t)(x - 1) << 23;
463
+ }
464
+ // Note: NaNs are not handled here
465
+
466
+ float result;
467
+ memcpy(&result, &bits, sizeof(float));
468
+ return result;
469
+ }
470
+
471
+ #define WSP_GGML_E8M0_TO_FP32(x) wsp_ggml_e8m0_to_fp32(x)
472
+ #define WSP_GGML_E8M0_TO_FP32_HALF(x) wsp_ggml_e8m0_to_fp32_half(x)
473
+
397
474
  /**
398
475
  * Converts brain16 to float32.
399
476
  *
@@ -23,6 +23,9 @@
23
23
  #define N_R0_Q8_0 4
24
24
  #define N_SG_Q8_0 2
25
25
 
26
+ #define N_R0_MXFP4 2
27
+ #define N_SG_MXFP4 2
28
+
26
29
  #define N_R0_Q2_K 4
27
30
  #define N_SG_Q2_K 2
28
31
 
@@ -126,8 +129,18 @@ typedef struct {
126
129
  uint64_t nb2;
127
130
  uint64_t nb3;
128
131
  uint64_t offs;
132
+ uint64_t o1[8];
129
133
  } wsp_ggml_metal_kargs_bin;
130
134
 
135
+ typedef struct {
136
+ int64_t ne0;
137
+ int64_t ne1;
138
+ size_t nb01;
139
+ size_t nb02;
140
+ size_t nb11;
141
+ size_t nb21;
142
+ } wsp_ggml_metal_kargs_add_id;
143
+
131
144
  typedef struct {
132
145
  int32_t ne00;
133
146
  int32_t ne01;
@@ -229,14 +242,18 @@ typedef struct {
229
242
  uint64_t nb21;
230
243
  uint64_t nb22;
231
244
  uint64_t nb23;
245
+ int32_t ne32;
246
+ int32_t ne33;
232
247
  uint64_t nb31;
248
+ uint64_t nb32;
249
+ uint64_t nb33;
233
250
  int32_t ne1;
234
251
  int32_t ne2;
235
252
  float scale;
236
253
  float max_bias;
237
254
  float m0;
238
255
  float m1;
239
- uint16_t n_head_log2;
256
+ int32_t n_head_log2;
240
257
  float logit_softcap;
241
258
  } wsp_ggml_metal_kargs_flash_attn_ext;
242
259
 
@@ -373,8 +390,16 @@ typedef struct {
373
390
  typedef struct {
374
391
  int32_t ne00;
375
392
  int32_t ne00_4;
376
- uint64_t nb01;
393
+ uint64_t nb1;
394
+ uint64_t nb2;
395
+ uint64_t nb3;
377
396
  float eps;
397
+ int32_t nef1[3];
398
+ int32_t nef2[3];
399
+ int32_t nef3[3];
400
+ uint64_t nbf1[3];
401
+ uint64_t nbf2[3];
402
+ uint64_t nbf3[3];
378
403
  } wsp_ggml_metal_kargs_rms_norm;
379
404
 
380
405
  typedef struct {
@@ -431,6 +456,8 @@ typedef struct{
431
456
  uint64_t nb1;
432
457
  int32_t i00;
433
458
  int32_t i10;
459
+ float alpha;
460
+ float limit;
434
461
  } wsp_ggml_metal_kargs_glu;
435
462
 
436
463
  typedef struct {
@@ -461,14 +488,26 @@ typedef struct {
461
488
  } wsp_ggml_metal_kargs_sum_rows;
462
489
 
463
490
  typedef struct {
464
- int64_t ne00;
465
- int64_t ne01;
466
- int64_t ne02;
491
+ int32_t ne00;
492
+ int32_t ne01;
493
+ int32_t ne02;
494
+ uint64_t nb01;
495
+ uint64_t nb02;
496
+ uint64_t nb03;
497
+ int32_t ne11;
498
+ int32_t ne12;
499
+ int32_t ne13;
500
+ uint64_t nb11;
501
+ uint64_t nb12;
502
+ uint64_t nb13;
503
+ uint64_t nb1;
504
+ uint64_t nb2;
505
+ uint64_t nb3;
467
506
  float scale;
468
507
  float max_bias;
469
508
  float m0;
470
509
  float m1;
471
- uint32_t n_head_log2;
510
+ int32_t n_head_log2;
472
511
  } wsp_ggml_metal_kargs_soft_max;
473
512
 
474
513
  typedef struct {
@@ -499,26 +538,26 @@ typedef struct {
499
538
  typedef struct {
500
539
  int64_t d_state;
501
540
  int64_t d_inner;
541
+ int64_t n_head;
542
+ int64_t n_group;
502
543
  int64_t n_seq_tokens;
503
544
  int64_t n_seqs;
504
- uint64_t nb00;
545
+ int64_t s_off;
505
546
  uint64_t nb01;
506
547
  uint64_t nb02;
507
- uint64_t nb10;
548
+ uint64_t nb03;
508
549
  uint64_t nb11;
509
550
  uint64_t nb12;
510
551
  uint64_t nb13;
511
- uint64_t nb20;
512
552
  uint64_t nb21;
513
553
  uint64_t nb22;
514
- uint64_t nb30;
515
554
  uint64_t nb31;
516
- uint64_t nb40;
517
555
  uint64_t nb41;
518
556
  uint64_t nb42;
519
- uint64_t nb50;
557
+ uint64_t nb43;
520
558
  uint64_t nb51;
521
559
  uint64_t nb52;
560
+ uint64_t nb53;
522
561
  } wsp_ggml_metal_kargs_ssm_scan;
523
562
 
524
563
  typedef struct {