whisper.rn 0.4.0-rc.10 → 0.4.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-cpu.c CHANGED
@@ -3,11 +3,10 @@
3
3
 
4
4
  #include "ggml-backend-impl.h"
5
5
  #include "ggml-backend.h"
6
- #include "ggml-cpu-aarch64.h"
6
+ #include "ggml-cpu-traits.h"
7
7
  #include "ggml-cpu-impl.h"
8
8
  #include "ggml-cpu.h"
9
9
  #include "ggml-impl.h"
10
- #include "ggml-quants.h"
11
10
  #include "ggml-cpu-quants.h"
12
11
  #include "ggml-threading.h"
13
12
  #include "ggml.h"
@@ -109,10 +108,12 @@ static wsp_ggml_fp16_t wsp_ggml_table_gelu_quick_f16[1 << 16];
109
108
  #if defined(__ARM_ARCH)
110
109
  struct wsp_ggml_arm_arch_features_type {
111
110
  int has_neon;
111
+ int has_dotprod;
112
112
  int has_i8mm;
113
113
  int has_sve;
114
114
  int sve_cnt;
115
- } wsp_ggml_arm_arch_features = {-1, -1, -1, 0};
115
+ int has_sme;
116
+ } wsp_ggml_arm_arch_features = {-1, -1, -1, -1, 0, -1};
116
117
  #endif
117
118
 
118
119
 
@@ -124,8 +125,7 @@ struct wsp_ggml_arm_arch_features_type {
124
125
  #endif
125
126
  #include <windows.h>
126
127
 
127
-
128
- #if !defined(__clang__)
128
+ #if defined(_MSC_VER) && !defined(__clang__)
129
129
  #define WSP_GGML_CACHE_ALIGN __declspec(align(WSP_GGML_CACHE_LINE))
130
130
 
131
131
  typedef volatile LONG atomic_int;
@@ -222,10 +222,6 @@ typedef void * thread_ret_t;
222
222
 
223
223
  typedef pthread_t wsp_ggml_thread_t;
224
224
 
225
- #ifdef WSP_GGML_USE_CPU_HBM
226
- #include <hbwmalloc.h>
227
- #endif
228
-
229
225
  #if defined(__APPLE__)
230
226
  #include <unistd.h>
231
227
  #include <mach/mach.h>
@@ -241,6 +237,8 @@ typedef pthread_t wsp_ggml_thread_t;
241
237
  #else
242
238
  #if defined(__POWER9_VECTOR__)
243
239
  #define CACHE_LINE_SIZE 128
240
+ #elif defined(__VXE__) || defined(__VXE2__)
241
+ #define CACHE_LINE_SIZE 256
244
242
  #else
245
243
  #define CACHE_LINE_SIZE 64
246
244
  #endif
@@ -299,7 +297,6 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
299
297
  },
300
298
  [WSP_GGML_TYPE_Q8_0] = {
301
299
  .from_float = wsp_quantize_row_q8_0,
302
- .from_float_to_mat = wsp_quantize_mat_q8_0,
303
300
  .vec_dot = wsp_ggml_vec_dot_q8_0_q8_0,
304
301
  .vec_dot_type = WSP_GGML_TYPE_Q8_0,
305
302
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -407,33 +404,6 @@ static const struct wsp_ggml_type_traits_cpu type_traits_cpu[WSP_GGML_TYPE_COUNT
407
404
  .vec_dot_type = WSP_GGML_TYPE_BF16,
408
405
  .nrows = 1,
409
406
  },
410
- [WSP_GGML_TYPE_Q4_0_4_4] = {
411
- .from_float = NULL,
412
- .vec_dot = NULL,
413
- .vec_dot_type = WSP_GGML_TYPE_Q8_0,
414
- .nrows = 1,
415
- .ncols = 4,
416
- .gemv = wsp_ggml_gemv_q4_0_4x4_q8_0,
417
- .gemm = wsp_ggml_gemm_q4_0_4x4_q8_0,
418
- },
419
- [WSP_GGML_TYPE_Q4_0_4_8] = {
420
- .from_float = NULL,
421
- .vec_dot = NULL,
422
- .vec_dot_type = WSP_GGML_TYPE_Q8_0,
423
- .nrows = 1,
424
- .ncols = 4,
425
- .gemv = wsp_ggml_gemv_q4_0_4x8_q8_0,
426
- .gemm = wsp_ggml_gemm_q4_0_4x8_q8_0,
427
- },
428
- [WSP_GGML_TYPE_Q4_0_8_8] = {
429
- .from_float = NULL,
430
- .vec_dot = NULL,
431
- .vec_dot_type = WSP_GGML_TYPE_Q8_0,
432
- .nrows = 1,
433
- .ncols = 8,
434
- .gemv = wsp_ggml_gemv_q4_0_8x8_q8_0,
435
- .gemm = wsp_ggml_gemm_q4_0_8x8_q8_0,
436
- },
437
407
  [WSP_GGML_TYPE_TQ1_0] = {
438
408
  .from_float = wsp_quantize_row_tq1_0,
439
409
  .vec_dot = wsp_ggml_vec_dot_tq1_0_q8_K,
@@ -485,21 +455,21 @@ const struct wsp_ggml_type_traits_cpu * wsp_ggml_get_type_traits_cpu(enum wsp_gg
485
455
  #define WSP_GGML_F32x4_ADD vaddq_f32
486
456
  #define WSP_GGML_F32x4_MUL vmulq_f32
487
457
  #define WSP_GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
488
- #define WSP_GGML_F32x4_REDUCE(res, x) \
489
- { \
490
- int offset = WSP_GGML_F32_ARR >> 1; \
491
- for (int i = 0; i < offset; ++i) { \
492
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
493
- } \
494
- offset >>= 1; \
495
- for (int i = 0; i < offset; ++i) { \
496
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
497
- } \
498
- offset >>= 1; \
499
- for (int i = 0; i < offset; ++i) { \
500
- (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
501
- } \
502
- (res) = WSP_GGML_F32x4_REDUCE_ONE((x)[0]); \
458
+ #define WSP_GGML_F32x4_REDUCE(res, x) \
459
+ { \
460
+ int offset = WSP_GGML_F32_ARR >> 1; \
461
+ for (int i = 0; i < offset; ++i) { \
462
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
463
+ } \
464
+ offset >>= 1; \
465
+ for (int i = 0; i < offset; ++i) { \
466
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
467
+ } \
468
+ offset >>= 1; \
469
+ for (int i = 0; i < offset; ++i) { \
470
+ (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
471
+ } \
472
+ (res) = (wsp_ggml_float) WSP_GGML_F32x4_REDUCE_ONE((x)[0]); \
503
473
  }
504
474
 
505
475
  #define WSP_GGML_F32_VEC WSP_GGML_F32x4
@@ -614,7 +584,7 @@ do { \
614
584
  for (int i = 0; i < offset; ++i) { \
615
585
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
616
586
  } \
617
- res = _mm512_reduce_add_ps(x[0]); \
587
+ res = (wsp_ggml_float) _mm512_reduce_add_ps(x[0]); \
618
588
  } while (0)
619
589
 
620
590
  // TODO: is this optimal ?
@@ -664,7 +634,7 @@ do { \
664
634
  for (int i = 0; i < offset; ++i) { \
665
635
  x[i] = _mm512_add_ps(x[i], x[offset+i]); \
666
636
  } \
667
- res = _mm512_reduce_add_ps(x[0]); \
637
+ res = (wsp_ggml_float) _mm512_reduce_add_ps(x[0]); \
668
638
  } while (0)
669
639
 
670
640
  #define WSP_GGML_F16_VEC WSP_GGML_F32Cx16
@@ -675,8 +645,8 @@ do { \
675
645
  #define WSP_GGML_F16_VEC_FMA WSP_GGML_F32Cx16_FMA
676
646
  #define WSP_GGML_F16_VEC_ADD WSP_GGML_F32Cx16_ADD
677
647
  #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32Cx16_MUL
678
- #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx16_REDUCE
679
648
 
649
+ #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx16_REDUCE
680
650
  #elif defined(__AVX__)
681
651
 
682
652
  #define WSP_GGML_SIMD
@@ -745,7 +715,7 @@ do { \
745
715
  #define WSP_GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
746
716
  #define WSP_GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
747
717
  #else
748
- static inline __m256 __avx_f32cx8_load(wsp_ggml_fp16_t *x) {
718
+ static inline __m256 __avx_f32cx8_load(const wsp_ggml_fp16_t * x) {
749
719
  float tmp[8];
750
720
 
751
721
  for (int i = 0; i < 8; i++) {
@@ -1017,7 +987,7 @@ inline static void __wasm_f16x4_store(wsp_ggml_fp16_t * p, v128_t x) {
1017
987
  #define WSP_GGML_F16_STEP 32
1018
988
  #define WSP_GGML_F16_EPR 4
1019
989
 
1020
- static inline __m128 __sse_f16x4_load(wsp_ggml_fp16_t *x) {
990
+ static inline __m128 __sse_f16x4_load(const wsp_ggml_fp16_t * x) {
1021
991
  float tmp[4];
1022
992
 
1023
993
  tmp[0] = WSP_GGML_FP16_TO_FP32(x[0]);
@@ -1028,7 +998,7 @@ static inline __m128 __sse_f16x4_load(wsp_ggml_fp16_t *x) {
1028
998
  return _mm_loadu_ps(tmp);
1029
999
  }
1030
1000
 
1031
- static inline void __sse_f16x4_store(wsp_ggml_fp16_t *x, __m128 y) {
1001
+ static inline void __sse_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
1032
1002
  float arr[4];
1033
1003
 
1034
1004
  _mm_storeu_ps(arr, y);
@@ -1109,29 +1079,23 @@ do { \
1109
1079
  #define WSP_GGML_F16_STEP 32
1110
1080
  #define WSP_GGML_F16_EPR 8
1111
1081
 
1112
- // F16 arithmetic is not supported by AVX, so we use F32 instead
1082
+ // F16 arithmetic is not supported by LASX, so we use F32 instead
1113
1083
 
1114
1084
  #define WSP_GGML_F32Cx8 __m256
1115
1085
  #define WSP_GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
1116
1086
  #define WSP_GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
1117
1087
 
1118
1088
  static inline __m256 __lasx_f32cx8_load(const wsp_ggml_fp16_t * x) {
1119
- float tmp[8];
1120
-
1121
- for (int i = 0; i < 8; i++) {
1122
- tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
1123
- }
1124
-
1125
- return (__m256)__lasx_xvld(tmp, 0);
1089
+ __m256i a;
1090
+ memcpy(&a, x, sizeof(wsp_ggml_fp16_t) * 8);
1091
+ a = __lasx_xvpermi_d(a, 0 | (1 << 4));
1092
+ return __lasx_xvfcvtl_s_h(a);
1126
1093
  }
1127
- static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
1128
- float arr[8];
1129
-
1130
- __lasx_xvst(y, arr, 0);
1131
1094
 
1132
- for (int i = 0; i < 8; i++) {
1133
- x[i] = WSP_GGML_FP32_TO_FP16(arr[i]);
1134
- }
1095
+ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
1096
+ __m256i a = __lasx_xvfcvt_h_s(y, y);
1097
+ a = __lasx_xvpermi_d(a, 0 | (2 << 2));
1098
+ memcpy(x, &a, sizeof(wsp_ggml_fp16_t) * 8);
1135
1099
  }
1136
1100
  #define WSP_GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
1137
1101
  #define WSP_GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1168,28 +1132,28 @@ static inline void __lasx_f32cx8_store(wsp_ggml_fp16_t * x, __m256 y) {
1168
1132
  #define WSP_GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
1169
1133
  #define WSP_GGML_F32x4_ADD __lsx_vfadd_s
1170
1134
  #define WSP_GGML_F32x4_MUL __lsx_vfmul_s
1171
- #define WSP_GGML_F32x4_REDUCE(res, x) \
1172
- { \
1173
- int offset = WSP_GGML_F32_ARR >> 1; \
1174
- for (int i = 0; i < offset; ++i) { \
1175
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1176
- } \
1177
- offset >>= 1; \
1178
- for (int i = 0; i < offset; ++i) { \
1179
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1180
- } \
1181
- offset >>= 1; \
1182
- for (int i = 0; i < offset; ++i) { \
1183
- x[i] = __lsx_vfadd_s(x[i], x[offset+i]); \
1184
- } \
1185
- __m128i tmp = __lsx_vsrli_d((__m128i)x[0], 32); \
1186
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, x[0]); \
1187
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1188
- const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1189
- tmp = __lsx_vsrli_d((__m128i)t0, 32); \
1190
- tmp = (__m128i)__lsx_vfadd_s((__m128)tmp, t0); \
1191
- tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1192
- res = (wsp_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1135
+ #define WSP_GGML_F32x4_REDUCE(res, x) \
1136
+ { \
1137
+ int offset = WSP_GGML_F32_ARR >> 1; \
1138
+ for (int i = 0; i < offset; ++i) { \
1139
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1140
+ } \
1141
+ offset >>= 1; \
1142
+ for (int i = 0; i < offset; ++i) { \
1143
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1144
+ } \
1145
+ offset >>= 1; \
1146
+ for (int i = 0; i < offset; ++i) { \
1147
+ x[i] = __lsx_vfadd_s(x[i], x[offset + i]); \
1148
+ } \
1149
+ __m128i tmp = __lsx_vsrli_d((__m128i) x[0], 32); \
1150
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, x[0]); \
1151
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1152
+ const __m128 t0 = __lsx_vshuf4i_w(tmp, 0x88); \
1153
+ tmp = __lsx_vsrli_d((__m128i) t0, 32); \
1154
+ tmp = (__m128i) __lsx_vfadd_s((__m128) tmp, t0); \
1155
+ tmp = __lsx_vpickev_w(__lsx_vldi(0), tmp); \
1156
+ res = (wsp_ggml_float) __lsx_vpickve2gr_w(__lsx_vshuf4i_w(tmp, 0x88), 0); \
1193
1157
  }
1194
1158
 
1195
1159
  #define WSP_GGML_F32_VEC WSP_GGML_F32x4
@@ -1249,6 +1213,87 @@ static inline void __lsx_f16x4_store(wsp_ggml_fp16_t * x, __m128 y) {
1249
1213
  #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32Cx4_MUL
1250
1214
  #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32Cx4_REDUCE
1251
1215
 
1216
+ #elif defined(__VXE__) || defined(__VXE2__)
1217
+
1218
+ #define WSP_GGML_SIMD
1219
+
1220
+ // F32 s390x
1221
+
1222
+ #define WSP_GGML_F32_STEP 32
1223
+ #define WSP_GGML_F32_EPR 4
1224
+
1225
+ #define WSP_GGML_F32x4 __vector float
1226
+ #define WSP_GGML_F32x4_ZERO vec_splats(0.0f)
1227
+ #define WSP_GGML_F32x4_SET1 vec_splats
1228
+ #define WSP_GGML_F32x4_LOAD(p) vec_xl(0, p)
1229
+ #define WSP_GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
1230
+ #define WSP_GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
1231
+ #define WSP_GGML_F32x4_ADD vec_add
1232
+ #define WSP_GGML_F32x4_MUL vec_mul
1233
+ #define WSP_GGML_F32x4_REDUCE(res, x) \
1234
+ { \
1235
+ int offset = WSP_GGML_F32_ARR >> 1; \
1236
+ for (int i = 0; i < offset; ++i) { \
1237
+ x[i] = vec_add(x[i], x[offset + i]); \
1238
+ } \
1239
+ offset >>= 1; \
1240
+ for (int i = 0; i < offset; ++i) { \
1241
+ x[i] = vec_add(x[i], x[offset + i]); \
1242
+ } \
1243
+ offset >>= 1; \
1244
+ for (int i = 0; i < offset; ++i) { \
1245
+ x[i] = vec_add(x[i], x[offset + i]); \
1246
+ } \
1247
+ res = vec_extract(x[0], 0) + \
1248
+ vec_extract(x[0], 1) + \
1249
+ vec_extract(x[0], 2) + \
1250
+ vec_extract(x[0], 3); \
1251
+ }
1252
+
1253
+ #define WSP_GGML_F32_VEC WSP_GGML_F32x4
1254
+ #define WSP_GGML_F32_VEC_ZERO WSP_GGML_F32x4_ZERO
1255
+ #define WSP_GGML_F32_VEC_SET1 WSP_GGML_F32x4_SET1
1256
+ #define WSP_GGML_F32_VEC_LOAD WSP_GGML_F32x4_LOAD
1257
+ #define WSP_GGML_F32_VEC_STORE WSP_GGML_F32x4_STORE
1258
+ #define WSP_GGML_F32_VEC_FMA WSP_GGML_F32x4_FMA
1259
+ #define WSP_GGML_F32_VEC_ADD WSP_GGML_F32x4_ADD
1260
+ #define WSP_GGML_F32_VEC_MUL WSP_GGML_F32x4_MUL
1261
+ #define WSP_GGML_F32_VEC_REDUCE WSP_GGML_F32x4_REDUCE
1262
+
1263
+ // F16 s390x
1264
+ #define WSP_GGML_F16_STEP WSP_GGML_F32_STEP
1265
+ #define WSP_GGML_F16_EPR WSP_GGML_F32_EPR
1266
+
1267
+ static inline __vector float __lzs_f16cx4_load(const wsp_ggml_fp16_t * x) {
1268
+ float tmp[4];
1269
+
1270
+ for (int i = 0; i < 4; i++) {
1271
+ tmp[i] = WSP_GGML_FP16_TO_FP32(x[i]);
1272
+ }
1273
+
1274
+ return vec_xl(0, tmp);
1275
+ }
1276
+
1277
+ static inline void __lzs_f16cx4_store(wsp_ggml_fp16_t * x, __vector float y) {
1278
+ float arr[4];
1279
+
1280
+ vec_xst(y, 0, arr);
1281
+
1282
+ for (int i = 0; i < 4; i++) {
1283
+ x[i] = WSP_GGML_FP32_TO_FP16(arr[i]);
1284
+ }
1285
+ }
1286
+
1287
+ #define WSP_GGML_F16_VEC WSP_GGML_F32x4
1288
+ #define WSP_GGML_F16_VEC_ZERO WSP_GGML_F32x4_ZERO
1289
+ #define WSP_GGML_F16_VEC_SET1 WSP_GGML_F32x4_SET1
1290
+ #define WSP_GGML_F16_VEC_LOAD(p, i) __lzs_f16cx4_load(p)
1291
+ #define WSP_GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
1292
+ #define WSP_GGML_F16_VEC_FMA WSP_GGML_F32x4_FMA
1293
+ #define WSP_GGML_F16_VEC_ADD WSP_GGML_F32x4_ADD
1294
+ #define WSP_GGML_F16_VEC_MUL WSP_GGML_F32x4_MUL
1295
+ #define WSP_GGML_F16_VEC_REDUCE WSP_GGML_F32x4_REDUCE
1296
+
1252
1297
  #endif
1253
1298
 
1254
1299
  // WSP_GGML_F32_ARR / WSP_GGML_F16_ARR
@@ -1328,12 +1373,12 @@ struct wsp_ggml_threadpool {
1328
1373
  atomic_int n_graph; // incremented when there is work to be done (i.e each graph)
1329
1374
  atomic_int WSP_GGML_CACHE_ALIGN n_barrier;
1330
1375
  atomic_int WSP_GGML_CACHE_ALIGN n_barrier_passed;
1331
- atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1376
+ atomic_int WSP_GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
1332
1377
 
1333
1378
  // these are atomic as an annotation for thread-sanitizer
1334
1379
  atomic_bool stop; // Used for stopping the threadpool altogether
1335
1380
  atomic_bool pause; // Used for pausing the threadpool or individual threads
1336
- atomic_bool abort; // Used for aborting processing of a graph
1381
+ atomic_int abort; // Used for aborting processing of a graph
1337
1382
 
1338
1383
  struct wsp_ggml_compute_state * workers; // per thread state
1339
1384
  int n_threads_max; // number of threads in the pool
@@ -1357,41 +1402,48 @@ struct wsp_ggml_compute_state {
1357
1402
  int ith;
1358
1403
  };
1359
1404
 
1360
- struct wsp_ggml_compute_params {
1361
- // ith = thread index, nth = number of threads
1362
- int ith, nth;
1363
-
1364
- // work buffer for all threads
1365
- size_t wsize;
1366
- void * wdata;
1367
-
1368
- struct wsp_ggml_threadpool * threadpool;
1369
- };
1370
-
1371
1405
  //
1372
1406
  // fundamental operations
1373
1407
  //
1374
1408
 
1375
1409
  inline static void wsp_ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1376
-
1377
1410
  inline static void wsp_ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1378
1411
 
1379
- inline static void wsp_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1412
+ inline static void wsp_ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1413
+ inline static void wsp_ggml_vec_cpy_i32(const int n, int32_t * y, const int32_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
1380
1414
 
1381
1415
  inline static void wsp_ggml_vec_set_f16(const int n, wsp_ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1382
-
1383
1416
  inline static void wsp_ggml_vec_set_bf16(const int n, wsp_ggml_bf16_t * x, const wsp_ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }
1384
-
1385
1417
  inline static void wsp_ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; }
1418
+ inline static void wsp_ggml_vec_add_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
1419
+ for (int i = 0; i < n; ++i) {
1420
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) + WSP_GGML_FP16_TO_FP32(y[i]));
1421
+ }
1422
+ }
1386
1423
  inline static void wsp_ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; }
1387
1424
  inline static void wsp_ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; }
1388
1425
  inline static void wsp_ggml_vec_acc1_f32(const int n, float * y, const float v) { for (int i = 0; i < n; ++i) y[i] += v; }
1389
1426
  inline static void wsp_ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; }
1427
+ inline static void wsp_ggml_vec_sub_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
1428
+ for (int i = 0; i < n; ++i) {
1429
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) - WSP_GGML_FP16_TO_FP32(y[i]));
1430
+ }
1431
+ }
1390
1432
  inline static void wsp_ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
1391
1433
  inline static void wsp_ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
1392
1434
  inline static void wsp_ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
1393
1435
  inline static void wsp_ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
1436
+ inline static void wsp_ggml_vec_mul_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
1437
+ for (int i = 0; i < n; ++i) {
1438
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) * WSP_GGML_FP16_TO_FP32(y[i]));
1439
+ }
1440
+ }
1394
1441
  inline static void wsp_ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; }
1442
+ inline static void wsp_ggml_vec_div_f16 (const int n, wsp_ggml_fp16_t * z, const wsp_ggml_fp16_t * x, const wsp_ggml_fp16_t * y) {
1443
+ for (int i = 0; i < n; ++i) {
1444
+ z[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(x[i]) / WSP_GGML_FP16_TO_FP32(y[i]));
1445
+ }
1446
+ }
1395
1447
 
1396
1448
  static void wsp_ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc) {
1397
1449
  assert(nrc == 1);
@@ -1868,7 +1920,7 @@ inline static float wsp_ggml_silu_f32(float x) {
1868
1920
 
1869
1921
  #if __FINITE_MATH_ONLY__
1870
1922
  #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
1871
- #error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
1923
+ #error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
1872
1924
  #endif
1873
1925
 
1874
1926
  #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -2276,7 +2328,7 @@ struct wsp_ggml_state {
2276
2328
 
2277
2329
  static struct wsp_ggml_state g_state = {0};
2278
2330
 
2279
- static void wsp_ggml_barrier(struct wsp_ggml_threadpool * tp) {
2331
+ void wsp_ggml_barrier(struct wsp_ggml_threadpool * tp) {
2280
2332
  int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
2281
2333
  if (n_threads == 1) {
2282
2334
  return;
@@ -2430,7 +2482,11 @@ bool wsp_ggml_is_numa(void) {
2430
2482
  #endif
2431
2483
 
2432
2484
  #if !defined(HWCAP2_I8MM)
2433
- #define HWCAP2_I8MM 0
2485
+ #define HWCAP2_I8MM (1 << 13)
2486
+ #endif
2487
+
2488
+ #if !defined(HWCAP2_SME)
2489
+ #define HWCAP2_SME (1 << 23)
2434
2490
  #endif
2435
2491
 
2436
2492
  static void wsp_ggml_init_arm_arch_features(void) {
@@ -2438,9 +2494,11 @@ static void wsp_ggml_init_arm_arch_features(void) {
2438
2494
  uint32_t hwcap = getauxval(AT_HWCAP);
2439
2495
  uint32_t hwcap2 = getauxval(AT_HWCAP2);
2440
2496
 
2441
- wsp_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
2442
- wsp_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
2443
- wsp_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
2497
+ wsp_ggml_arm_arch_features.has_neon = !!(hwcap & HWCAP_ASIMD);
2498
+ wsp_ggml_arm_arch_features.has_dotprod = !!(hwcap & HWCAP_ASIMDDP);
2499
+ wsp_ggml_arm_arch_features.has_i8mm = !!(hwcap2 & HWCAP2_I8MM);
2500
+ wsp_ggml_arm_arch_features.has_sve = !!(hwcap & HWCAP_SVE);
2501
+ wsp_ggml_arm_arch_features.has_sme = !!(hwcap2 & HWCAP2_SME);
2444
2502
 
2445
2503
  #if defined(__ARM_FEATURE_SVE)
2446
2504
  wsp_ggml_arm_arch_features.sve_cnt = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
@@ -2453,11 +2511,21 @@ static void wsp_ggml_init_arm_arch_features(void) {
2453
2511
  }
2454
2512
  wsp_ggml_arm_arch_features.has_neon = oldp;
2455
2513
 
2514
+ if (sysctlbyname("hw.optional.arm.FEAT_DotProd", &oldp, &size, NULL, 0) != 0) {
2515
+ oldp = 0;
2516
+ }
2517
+ wsp_ggml_arm_arch_features.has_dotprod = oldp;
2518
+
2456
2519
  if (sysctlbyname("hw.optional.arm.FEAT_I8MM", &oldp, &size, NULL, 0) != 0) {
2457
2520
  oldp = 0;
2458
2521
  }
2459
2522
  wsp_ggml_arm_arch_features.has_i8mm = oldp;
2460
2523
 
2524
+ if (sysctlbyname("hw.optional.arm.FEAT_SME", &oldp, &size, NULL, 0) != 0) {
2525
+ oldp = 0;
2526
+ }
2527
+ wsp_ggml_arm_arch_features.has_sme = oldp;
2528
+
2461
2529
  wsp_ggml_arm_arch_features.has_sve = 0;
2462
2530
  wsp_ggml_arm_arch_features.sve_cnt = 0;
2463
2531
  #else
@@ -2481,6 +2549,12 @@ static void wsp_ggml_init_arm_arch_features(void) {
2481
2549
  wsp_ggml_arm_arch_features.has_sve = 0;
2482
2550
  wsp_ggml_arm_arch_features.sve_cnt = 0;
2483
2551
  #endif
2552
+
2553
+ #if defined(__ARM_FEATURE_SME) || defined(__ARM_FEATURE_SME2)
2554
+ wsp_ggml_arm_arch_features.has_sme = 1;
2555
+ #else
2556
+ wsp_ggml_arm_arch_features.has_sme = 0;
2557
+ #endif
2484
2558
  #endif
2485
2559
  }
2486
2560
  #endif
@@ -4005,6 +4079,57 @@ static void wsp_ggml_compute_forward_dup_bytes(
4005
4079
  }
4006
4080
  }
4007
4081
 
4082
+ static void wsp_ggml_compute_forward_dup_q(
4083
+ const struct wsp_ggml_compute_params * params,
4084
+ struct wsp_ggml_tensor * dst) {
4085
+
4086
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
4087
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
4088
+
4089
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
4090
+
4091
+ const enum wsp_ggml_type type = src0->type;
4092
+ wsp_ggml_to_float_t const wsp_dewsp_quantize_row_q = wsp_ggml_get_type_traits(type)->to_float;
4093
+
4094
+ size_t qk = wsp_ggml_blck_size(type);
4095
+ const int64_t nr = wsp_ggml_nelements(src1) / qk;
4096
+
4097
+ // destination must be contiguous in the first dimension
4098
+ WSP_GGML_ASSERT(nb10 == wsp_ggml_type_size(dst->type));
4099
+ // must either have first dimension large enough to hold a row, or fully contiguous
4100
+ WSP_GGML_ASSERT((ne10 % qk) == 0 || wsp_ggml_is_contiguous(dst));
4101
+
4102
+ const int ith = params->ith;
4103
+ const int nth = params->nth;
4104
+
4105
+ const int dr = (nr + nth - 1)/nth;
4106
+
4107
+ // row range for this thread
4108
+ const int ir0 = dr*ith;
4109
+ const int ir1 = MIN(ir0 + dr, nr);
4110
+
4111
+ for (int64_t ir = ir0; ir < ir1; ++ir) {
4112
+
4113
+ uint32_t i = ir * qk;
4114
+
4115
+ const int64_t i03 = i/(ne00 * ne01 * ne02);
4116
+ const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
4117
+ const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
4118
+ const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
4119
+ const int64_t x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
4120
+
4121
+ const int64_t i13 = i/(ne10 * ne11 * ne12);
4122
+ const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
4123
+ const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
4124
+ const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
4125
+ const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
4126
+
4127
+ wsp_dewsp_quantize_row_q(
4128
+ (const void *) ((char *) src0->data + x_offset),
4129
+ (float *) ((char *) dst->data + dst_offset), qk);
4130
+ }
4131
+ }
4132
+
4008
4133
  static void wsp_ggml_compute_forward_dup(
4009
4134
  const struct wsp_ggml_compute_params * params,
4010
4135
  struct wsp_ggml_tensor * dst) {
@@ -4031,6 +4156,10 @@ static void wsp_ggml_compute_forward_dup(
4031
4156
  } break;
4032
4157
  default:
4033
4158
  {
4159
+ if (wsp_ggml_is_quantized(src0->type) && dst->type == WSP_GGML_TYPE_F32) {
4160
+ wsp_ggml_compute_forward_dup_q(params, dst);
4161
+ break;
4162
+ }
4034
4163
  WSP_GGML_ABORT("fatal error");
4035
4164
  }
4036
4165
  }
@@ -4270,7 +4399,7 @@ static void wsp_ggml_compute_forward_add_f16_f16(
4270
4399
  const struct wsp_ggml_tensor * src0 = dst->src[0];
4271
4400
  const struct wsp_ggml_tensor * src1 = dst->src[1];
4272
4401
 
4273
- WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst));
4402
+ WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
4274
4403
 
4275
4404
  const int ith = params->ith;
4276
4405
  const int nth = params->nth;
@@ -4295,17 +4424,22 @@ static void wsp_ggml_compute_forward_add_f16_f16(
4295
4424
 
4296
4425
  if (nb10 == sizeof(wsp_ggml_fp16_t)) {
4297
4426
  for (int ir = ir0; ir < ir1; ++ir) {
4298
- // src0, src1 and dst are same shape => same indices
4299
- const int i3 = ir/(ne2*ne1);
4300
- const int i2 = (ir - i3*ne2*ne1)/ne1;
4301
- const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
4427
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
4428
+ const int64_t i03 = ir/(ne02*ne01);
4429
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
4430
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
4431
+
4432
+ const int64_t i13 = i03 % ne13;
4433
+ const int64_t i12 = i02 % ne12;
4434
+ const int64_t i11 = i01 % ne11;
4435
+ const int64_t nr0 = ne00 / ne10;
4302
4436
 
4303
- wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
4304
- wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
4305
- wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11);
4437
+ wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
4438
+ wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
4439
+ wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
4306
4440
 
4307
- for (int i = 0; i < ne0; i++) {
4308
- dst_ptr[i] = WSP_GGML_FP32_TO_FP16(WSP_GGML_FP16_TO_FP32(src0_ptr[i]) + WSP_GGML_FP16_TO_FP32(src1_ptr[i]));
4441
+ for (int64_t r = 0; r < nr0; ++r) {
4442
+ wsp_ggml_vec_add_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
4309
4443
  }
4310
4444
  }
4311
4445
  }
@@ -4505,9 +4639,6 @@ static void wsp_ggml_compute_forward_add(
4505
4639
  case WSP_GGML_TYPE_IQ4_XS:
4506
4640
  case WSP_GGML_TYPE_IQ3_S:
4507
4641
  case WSP_GGML_TYPE_IQ2_S:
4508
- case WSP_GGML_TYPE_Q4_0_4_4:
4509
- case WSP_GGML_TYPE_Q4_0_4_8:
4510
- case WSP_GGML_TYPE_Q4_0_8_8:
4511
4642
  {
4512
4643
  wsp_ggml_compute_forward_add_q_f32(params, dst);
4513
4644
  } break;
@@ -4885,9 +5016,6 @@ static void wsp_ggml_compute_forward_add1(
4885
5016
  case WSP_GGML_TYPE_IQ4_XS:
4886
5017
  case WSP_GGML_TYPE_IQ3_S:
4887
5018
  case WSP_GGML_TYPE_IQ2_S:
4888
- case WSP_GGML_TYPE_Q4_0_4_4:
4889
- case WSP_GGML_TYPE_Q4_0_4_8:
4890
- case WSP_GGML_TYPE_Q4_0_8_8:
4891
5019
  {
4892
5020
  wsp_ggml_compute_forward_add1_q_f32(params, dst);
4893
5021
  } break;
@@ -5015,9 +5143,6 @@ static void wsp_ggml_compute_forward_acc(
5015
5143
  case WSP_GGML_TYPE_IQ4_XS:
5016
5144
  case WSP_GGML_TYPE_IQ3_S:
5017
5145
  case WSP_GGML_TYPE_IQ2_S:
5018
- case WSP_GGML_TYPE_Q4_0_4_4:
5019
- case WSP_GGML_TYPE_Q4_0_4_8:
5020
- case WSP_GGML_TYPE_Q4_0_8_8:
5021
5146
  default:
5022
5147
  {
5023
5148
  WSP_GGML_ABORT("fatal error");
@@ -5102,6 +5227,62 @@ static void wsp_ggml_compute_forward_sub_f32(
5102
5227
  }
5103
5228
  }
5104
5229
 
5230
+ static void wsp_ggml_compute_forward_sub_f16(
5231
+ const struct wsp_ggml_compute_params * params,
5232
+ struct wsp_ggml_tensor * dst) {
5233
+
5234
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
5235
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
5236
+
5237
+ assert(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
5238
+
5239
+ const int ith = params->ith;
5240
+ const int nth = params->nth;
5241
+
5242
+ const int nr = wsp_ggml_nrows(src0);
5243
+
5244
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
5245
+
5246
+ WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
5247
+ WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16);
5248
+ WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16);
5249
+
5250
+ WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t));
5251
+ WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
5252
+
5253
+ // rows per thread
5254
+ const int dr = (nr + nth - 1)/nth;
5255
+
5256
+ // row range for this thread
5257
+ const int ir0 = dr*ith;
5258
+ const int ir1 = MIN(ir0 + dr, nr);
5259
+
5260
+ if (nb10 == sizeof(wsp_ggml_fp16_t)) {
5261
+ for (int ir = ir0; ir < ir1; ++ir) {
5262
+ // src1 is broadcastable across src0 and dst in i1, i2, i3
5263
+ const int64_t i03 = ir/(ne02*ne01);
5264
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
5265
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
5266
+
5267
+ const int64_t i13 = i03 % ne13;
5268
+ const int64_t i12 = i02 % ne12;
5269
+ const int64_t i11 = i01 % ne11;
5270
+ const int64_t nr0 = ne00 / ne10;
5271
+
5272
+ wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
5273
+ wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
5274
+ wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
5275
+
5276
+ for (int64_t r = 0; r < nr0; ++r) {
5277
+ wsp_ggml_vec_sub_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
5278
+ }
5279
+ }
5280
+ } else {
5281
+ // src1 is not contiguous
5282
+ WSP_GGML_ABORT("unimplemented error");
5283
+ }
5284
+ }
5285
+
5105
5286
  static void wsp_ggml_compute_forward_sub(
5106
5287
  const struct wsp_ggml_compute_params * params,
5107
5288
  struct wsp_ggml_tensor * dst) {
@@ -5113,6 +5294,10 @@ static void wsp_ggml_compute_forward_sub(
5113
5294
  {
5114
5295
  wsp_ggml_compute_forward_sub_f32(params, dst);
5115
5296
  } break;
5297
+ case WSP_GGML_TYPE_F16:
5298
+ {
5299
+ wsp_ggml_compute_forward_sub_f16(params, dst);
5300
+ } break;
5116
5301
  default:
5117
5302
  {
5118
5303
  WSP_GGML_ABORT("fatal error");
@@ -5193,32 +5378,9 @@ static void wsp_ggml_compute_forward_mul_f32(
5193
5378
  }
5194
5379
  }
5195
5380
 
5196
- static void wsp_ggml_compute_forward_mul(
5197
- const struct wsp_ggml_compute_params * params,
5198
- struct wsp_ggml_tensor * dst) {
5199
-
5200
- const struct wsp_ggml_tensor * src0 = dst->src[0];
5201
- const struct wsp_ggml_tensor * src1 = dst->src[1];
5202
-
5203
- WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32 && "only f32 src1 supported for now");
5204
-
5205
- switch (src0->type) {
5206
- case WSP_GGML_TYPE_F32:
5207
- {
5208
- wsp_ggml_compute_forward_mul_f32(params, dst);
5209
- } break;
5210
- default:
5211
- {
5212
- WSP_GGML_ABORT("fatal error");
5213
- }
5214
- }
5215
- }
5216
-
5217
- // wsp_ggml_compute_forward_div
5218
-
5219
- static void wsp_ggml_compute_forward_div_f32(
5220
- const struct wsp_ggml_compute_params * params,
5221
- struct wsp_ggml_tensor * dst) {
5381
+ static void wsp_ggml_compute_forward_mul_f16(
5382
+ const struct wsp_ggml_compute_params * params,
5383
+ struct wsp_ggml_tensor * dst) {
5222
5384
 
5223
5385
  const struct wsp_ggml_tensor * src0 = dst->src[0];
5224
5386
  const struct wsp_ggml_tensor * src1 = dst->src[1];
@@ -5232,8 +5394,84 @@ static void wsp_ggml_compute_forward_div_f32(
5232
5394
 
5233
5395
  WSP_GGML_TENSOR_BINARY_OP_LOCALS
5234
5396
 
5235
- WSP_GGML_ASSERT( nb0 == sizeof(float));
5236
- WSP_GGML_ASSERT(nb00 == sizeof(float));
5397
+ WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
5398
+ WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16);
5399
+ WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16);
5400
+
5401
+ WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t));
5402
+ WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
5403
+
5404
+ if (nb10 == sizeof(wsp_ggml_fp16_t)) {
5405
+ for (int64_t ir = ith; ir < nr; ir += nth) {
5406
+ // src0 and dst are same shape => same indices
5407
+ const int64_t i03 = ir/(ne02*ne01);
5408
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
5409
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
5410
+
5411
+ const int64_t i13 = i03 % ne13;
5412
+ const int64_t i12 = i02 % ne12;
5413
+ const int64_t i11 = i01 % ne11;
5414
+ const int64_t nr0 = ne00 / ne10;
5415
+
5416
+ wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
5417
+ wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
5418
+ wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
5419
+
5420
+ for (int64_t r = 0 ; r < nr0; ++r) {
5421
+ wsp_ggml_vec_mul_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
5422
+ }
5423
+ }
5424
+ } else {
5425
+ // src1 is not contiguous
5426
+ WSP_GGML_ABORT("unimplemented error");
5427
+ }
5428
+ }
5429
+
5430
+ static void wsp_ggml_compute_forward_mul(
5431
+ const struct wsp_ggml_compute_params * params,
5432
+ struct wsp_ggml_tensor * dst) {
5433
+
5434
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
5435
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
5436
+
5437
+ WSP_GGML_ASSERT((src1->type == WSP_GGML_TYPE_F32 || src1->type == WSP_GGML_TYPE_F16) && "only f32/f16 src1 supported for now");
5438
+
5439
+ switch (src0->type) {
5440
+ case WSP_GGML_TYPE_F32:
5441
+ {
5442
+ wsp_ggml_compute_forward_mul_f32(params, dst);
5443
+ } break;
5444
+ case WSP_GGML_TYPE_F16:
5445
+ {
5446
+ wsp_ggml_compute_forward_mul_f16(params, dst);
5447
+ } break;
5448
+ default:
5449
+ {
5450
+ WSP_GGML_ABORT("fatal error");
5451
+ }
5452
+ }
5453
+ }
5454
+
5455
+ // wsp_ggml_compute_forward_div
5456
+
5457
+ static void wsp_ggml_compute_forward_div_f32(
5458
+ const struct wsp_ggml_compute_params * params,
5459
+ struct wsp_ggml_tensor * dst) {
5460
+
5461
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
5462
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
5463
+
5464
+ WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
5465
+
5466
+ const int ith = params->ith;
5467
+ const int nth = params->nth;
5468
+
5469
+ const int64_t nr = wsp_ggml_nrows(src0);
5470
+
5471
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
5472
+
5473
+ WSP_GGML_ASSERT( nb0 == sizeof(float));
5474
+ WSP_GGML_ASSERT(nb00 == sizeof(float));
5237
5475
 
5238
5476
  if (nb10 == sizeof(float)) {
5239
5477
  for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -5287,6 +5525,55 @@ static void wsp_ggml_compute_forward_div_f32(
5287
5525
  }
5288
5526
  }
5289
5527
 
5528
+ static void wsp_ggml_compute_forward_div_f16(
5529
+ const struct wsp_ggml_compute_params * params,
5530
+ struct wsp_ggml_tensor * dst) {
5531
+
5532
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
5533
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
5534
+
5535
+ WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
5536
+
5537
+ const int ith = params->ith;
5538
+ const int nth = params->nth;
5539
+
5540
+ const int64_t nr = wsp_ggml_nrows(src0);
5541
+
5542
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
5543
+
5544
+ WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
5545
+ WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16);
5546
+ WSP_GGML_ASSERT(dst->type == WSP_GGML_TYPE_F16);
5547
+
5548
+ WSP_GGML_ASSERT( nb0 == sizeof(wsp_ggml_fp16_t));
5549
+ WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
5550
+
5551
+ if (nb10 == sizeof(wsp_ggml_fp16_t)) {
5552
+ for (int64_t ir = ith; ir < nr; ir += nth) {
5553
+ // src0 and dst are same shape => same indices
5554
+ const int64_t i03 = ir/(ne02*ne01);
5555
+ const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
5556
+ const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
5557
+
5558
+ const int64_t i13 = i03 % ne13;
5559
+ const int64_t i12 = i02 % ne12;
5560
+ const int64_t i11 = i01 % ne11;
5561
+ const int64_t nr0 = ne00 / ne10;
5562
+
5563
+ wsp_ggml_fp16_t * dst_ptr = (wsp_ggml_fp16_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
5564
+ wsp_ggml_fp16_t * src0_ptr = (wsp_ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
5565
+ wsp_ggml_fp16_t * src1_ptr = (wsp_ggml_fp16_t *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
5566
+
5567
+ for (int64_t r = 0; r < nr0; ++r) {
5568
+ wsp_ggml_vec_div_f16(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
5569
+ }
5570
+ }
5571
+ } else {
5572
+ // src1 is not contiguous
5573
+ WSP_GGML_ABORT("unimplemented error");
5574
+ }
5575
+ }
5576
+
5290
5577
  static void wsp_ggml_compute_forward_div(
5291
5578
  const struct wsp_ggml_compute_params * params,
5292
5579
  struct wsp_ggml_tensor * dst) {
@@ -5298,6 +5585,10 @@ static void wsp_ggml_compute_forward_div(
5298
5585
  {
5299
5586
  wsp_ggml_compute_forward_div_f32(params, dst);
5300
5587
  } break;
5588
+ case WSP_GGML_TYPE_F16:
5589
+ {
5590
+ wsp_ggml_compute_forward_div_f16(params, dst);
5591
+ } break;
5301
5592
  default:
5302
5593
  {
5303
5594
  WSP_GGML_ABORT("fatal error");
@@ -6738,20 +7029,20 @@ static void wsp_ggml_compute_forward_silu_back_f32(
6738
7029
  const struct wsp_ggml_compute_params * params,
6739
7030
  struct wsp_ggml_tensor * dst) {
6740
7031
 
6741
- const struct wsp_ggml_tensor * src0 = dst->src[0];
6742
- const struct wsp_ggml_tensor * grad = dst->src[1];
7032
+ const struct wsp_ggml_tensor * grad = dst->src[0];
7033
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
6743
7034
 
6744
7035
  assert(wsp_ggml_is_contiguous_1(grad));
6745
- assert(wsp_ggml_is_contiguous_1(src0));
7036
+ assert(wsp_ggml_is_contiguous_1(src1));
6746
7037
  assert(wsp_ggml_is_contiguous_1(dst));
6747
- assert(wsp_ggml_are_same_shape(src0, dst));
6748
- assert(wsp_ggml_are_same_shape(src0, grad));
7038
+ assert(wsp_ggml_are_same_shape(src1, dst));
7039
+ assert(wsp_ggml_are_same_shape(src1, grad));
6749
7040
 
6750
7041
  const int ith = params->ith;
6751
7042
  const int nth = params->nth;
6752
7043
 
6753
- const int nc = src0->ne[0];
6754
- const int nr = wsp_ggml_nrows(src0);
7044
+ const int nc = src1->ne[0];
7045
+ const int nr = wsp_ggml_nrows(src1);
6755
7046
 
6756
7047
  // rows per thread
6757
7048
  const int dr = (nr + nth - 1)/nth;
@@ -6763,7 +7054,7 @@ static void wsp_ggml_compute_forward_silu_back_f32(
6763
7054
  for (int i1 = ir0; i1 < ir1; i1++) {
6764
7055
  wsp_ggml_vec_silu_backward_f32(nc,
6765
7056
  (float *) ((char *) dst->data + i1*( dst->nb[1])),
6766
- (float *) ((char *) src0->data + i1*(src0->nb[1])),
7057
+ (float *) ((char *) src1->data + i1*(src1->nb[1])),
6767
7058
  (float *) ((char *) grad->data + i1*(grad->nb[1])));
6768
7059
 
6769
7060
  #ifndef NDEBUG
@@ -6942,7 +7233,7 @@ static void wsp_ggml_compute_forward_norm_f32(
6942
7233
  float eps;
6943
7234
  memcpy(&eps, dst->op_params, sizeof(float));
6944
7235
 
6945
- WSP_GGML_ASSERT(eps > 0.0f);
7236
+ WSP_GGML_ASSERT(eps >= 0.0f);
6946
7237
 
6947
7238
  // TODO: optimize
6948
7239
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7013,7 +7304,7 @@ static void wsp_ggml_compute_forward_rms_norm_f32(
7013
7304
  float eps;
7014
7305
  memcpy(&eps, dst->op_params, sizeof(float));
7015
7306
 
7016
- WSP_GGML_ASSERT(eps > 0.0f);
7307
+ WSP_GGML_ASSERT(eps >= 0.0f);
7017
7308
 
7018
7309
  // TODO: optimize
7019
7310
  for (int64_t i03 = 0; i03 < ne03; i03++) {
@@ -7065,12 +7356,13 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32(
7065
7356
  const struct wsp_ggml_compute_params * params,
7066
7357
  struct wsp_ggml_tensor * dst) {
7067
7358
 
7068
- const struct wsp_ggml_tensor * src0 = dst->src[0];
7069
- const struct wsp_ggml_tensor * src1 = dst->src[1];
7359
+ const struct wsp_ggml_tensor * src0 = dst->src[0]; // gradients from forward pass output
7360
+ const struct wsp_ggml_tensor * src1 = dst->src[1]; // src1 from forward pass
7070
7361
 
7071
7362
  WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst) && wsp_ggml_are_same_shape(src0, src1));
7072
7363
 
7073
7364
  WSP_GGML_ASSERT(src0->nb[0] == sizeof(float));
7365
+ WSP_GGML_ASSERT(src1->nb[0] == sizeof(float));
7074
7366
 
7075
7367
  const int ith = params->ith;
7076
7368
  const int nth = params->nth;
@@ -7089,8 +7381,8 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32(
7089
7381
  const int64_t i12 = i02;
7090
7382
  const int64_t i13 = i03;
7091
7383
 
7092
- const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
7093
- const float * dz = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
7384
+ const float * dz = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
7385
+ const float * x = (float *) ((char *) src1->data + i11*nb11 + i12*nb12 + i13*nb13);
7094
7386
 
7095
7387
  wsp_ggml_float sum_xx = 0.0;
7096
7388
  wsp_ggml_float sum_xdz = 0.0;
@@ -7113,9 +7405,9 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32(
7113
7405
  {
7114
7406
  // z = rms_norm(x)
7115
7407
  //
7116
- // rms_norm(src0) =
7408
+ // rms_norm(src1) =
7117
7409
  // scale(
7118
- // src0,
7410
+ // src1,
7119
7411
  // div(
7120
7412
  // 1,
7121
7413
  // sqrt(
@@ -7123,13 +7415,13 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32(
7123
7415
  // scale(
7124
7416
  // sum(
7125
7417
  // sqr(
7126
- // src0)),
7418
+ // src1)),
7127
7419
  // (1.0/N)),
7128
7420
  // eps))));
7129
7421
 
7130
7422
  // postorder:
7131
7423
  // ## op args grad
7132
- // 00 param src0 grad[#00]
7424
+ // 00 param src1 grad[#00]
7133
7425
  // 01 const 1
7134
7426
  // 02 sqr (#00) grad[#02]
7135
7427
  // 03 sum (#02) grad[#03]
@@ -7206,6 +7498,7 @@ static void wsp_ggml_compute_forward_rms_norm_back_f32(
7206
7498
  // dx := scale(dx, rrms)
7207
7499
  float * dx = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);
7208
7500
 
7501
+ // dx[i00] = (x*(-sum_xdz/sum_eps) + dz) / sqrtf(mean_eps)
7209
7502
  wsp_ggml_vec_cpy_f32 (ne00, dx, x);
7210
7503
  // wsp_ggml_vec_scale_f32(ne00, dx, -mean_xdz/mean_eps);
7211
7504
  wsp_ggml_vec_scale_f32(ne00, dx, (float)(-sum_xdz)/sum_eps);
@@ -7433,20 +7726,9 @@ static void wsp_ggml_compute_forward_mul_mat(
7433
7726
  const int ith = params->ith;
7434
7727
  const int nth = params->nth;
7435
7728
 
7436
- enum wsp_ggml_type type = src0->type;
7437
-
7438
- if (src0->buffer && wsp_ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
7439
- type = (enum wsp_ggml_type)(intptr_t)src0->extra;
7440
- }
7441
-
7442
- enum wsp_ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7729
+ enum wsp_ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
7443
7730
  wsp_ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7444
- wsp_ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
7445
- int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
7446
- int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
7447
- int64_t const blck_size_interleave = wsp_ggml_get_type_traits(type)->blck_size_interleave;
7448
- wsp_ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
7449
- wsp_ggml_gemm_t const gemm = type_traits_cpu[type].gemm;
7731
+ int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows;
7450
7732
 
7451
7733
  WSP_GGML_ASSERT(ne0 == ne01);
7452
7734
  WSP_GGML_ASSERT(ne1 == ne11);
@@ -7454,7 +7736,7 @@ static void wsp_ggml_compute_forward_mul_mat(
7454
7736
  WSP_GGML_ASSERT(ne3 == ne13);
7455
7737
 
7456
7738
  // we don't support permuted src0 or src1
7457
- WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(type));
7739
+ WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(src0->type));
7458
7740
  WSP_GGML_ASSERT(nb10 == wsp_ggml_type_size(src1->type));
7459
7741
 
7460
7742
  // dst cannot be transposed or permuted
@@ -7466,6 +7748,7 @@ static void wsp_ggml_compute_forward_mul_mat(
7466
7748
  // nb01 >= nb00 - src0 is not transposed
7467
7749
  // compute by src0 rows
7468
7750
 
7751
+ // TODO: extract to "extra_op"
7469
7752
  #if WSP_GGML_USE_LLAMAFILE
7470
7753
  // broadcast factors
7471
7754
  const int64_t r2 = ne12 / ne02;
@@ -7476,15 +7759,15 @@ static void wsp_ggml_compute_forward_mul_mat(
7476
7759
  if (src1_cont) {
7477
7760
  for (int64_t i13 = 0; i13 < ne13; i13++)
7478
7761
  for (int64_t i12 = 0; i12 < ne12; i12++)
7479
- if (!llamafile_sgemm(ne01, ne11, ne00/wsp_ggml_blck_size(type),
7762
+ if (!llamafile_sgemm(params,
7763
+ ne01, ne11, ne00/wsp_ggml_blck_size(src0->type),
7480
7764
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7481
- nb01/wsp_ggml_type_size(type),
7765
+ nb01/wsp_ggml_type_size(src0->type),
7482
7766
  (const char *)src1->data + i12*nb12 + i13*nb13,
7483
7767
  nb11/wsp_ggml_type_size(src1->type),
7484
7768
  (char *)dst->data + i12*nb2 + i13*nb3,
7485
7769
  nb1/wsp_ggml_type_size(dst->type),
7486
- ith, nth,
7487
- type,
7770
+ src0->type,
7488
7771
  src1->type,
7489
7772
  dst->type))
7490
7773
  goto UseGgmlGemm1;
@@ -7496,6 +7779,7 @@ UseGgmlGemm1:;
7496
7779
  if (src1->type != vec_dot_type) {
7497
7780
  char * wdata = params->wdata;
7498
7781
 
7782
+ const size_t nbw0 = wsp_ggml_type_size(vec_dot_type);
7499
7783
  const size_t nbw1 = wsp_ggml_row_size(vec_dot_type, ne10);
7500
7784
  const size_t nbw2 = nbw1*ne11;
7501
7785
  const size_t nbw3 = nbw2*ne12;
@@ -7503,24 +7787,30 @@ UseGgmlGemm1:;
7503
7787
  assert(params->wsize >= ne13*nbw3);
7504
7788
  WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
7505
7789
 
7790
+ #if 0
7506
7791
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7507
7792
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
7508
- int64_t i11_processed = 0;
7509
- if ((wsp_ggml_n_dims(src1) == 2) && from_float_to_mat && gemm) {
7510
- for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
7511
- from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7512
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7513
- 4, ne10, blck_size_interleave);
7514
- }
7515
- i11_processed = ne11 - ne11 % 4;
7516
- }
7517
- for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
7793
+ for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
7518
7794
  from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7519
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7520
- ne10);
7795
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7796
+ ne10);
7797
+ }
7798
+ }
7799
+ }
7800
+ #else
7801
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
7802
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
7803
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
7804
+ size_t bs = wsp_ggml_blck_size(vec_dot_type);
7805
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
7806
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
7807
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
7808
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
7809
+ (ne10_block_end - ne10_block_start) * bs);
7521
7810
  }
7522
7811
  }
7523
7812
  }
7813
+ #endif
7524
7814
  }
7525
7815
 
7526
7816
  if (ith == 0) {
@@ -7537,15 +7827,15 @@ UseGgmlGemm1:;
7537
7827
 
7538
7828
  for (int64_t i13 = 0; i13 < ne13; i13++)
7539
7829
  for (int64_t i12 = 0; i12 < ne12; i12++)
7540
- if (!llamafile_sgemm(ne01, ne11, ne00/wsp_ggml_blck_size(type),
7830
+ if (!llamafile_sgemm(params,
7831
+ ne01, ne11, ne00/wsp_ggml_blck_size(src0->type),
7541
7832
  (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
7542
- nb01/wsp_ggml_type_size(type),
7833
+ nb01/wsp_ggml_type_size(src0->type),
7543
7834
  (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
7544
7835
  row_size/wsp_ggml_type_size(vec_dot_type),
7545
7836
  (char *)dst->data + i12*nb2 + i13*nb3,
7546
7837
  nb1/wsp_ggml_type_size(dst->type),
7547
- ith, nth,
7548
- type,
7838
+ src0->type,
7549
7839
  vec_dot_type,
7550
7840
  dst->type))
7551
7841
  goto UseGgmlGemm2;
@@ -7560,14 +7850,6 @@ UseGgmlGemm2:;
7560
7850
  // This is the size of the rest of the dimensions of the result
7561
7851
  const int64_t nr1 = ne1 * ne2 * ne3;
7562
7852
 
7563
- // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
7564
- int64_t num_rows_per_vec_dot = vec_dot_num_rows;
7565
- // TODO: currently the mmla kernels support only even numbered rows/cols.
7566
- // this check can be removed once they are extended to support odd numbered rows/cols too
7567
- if ((nr0 % 2 != 0) || (ne11 % 2 != 0)) {
7568
- num_rows_per_vec_dot = 1;
7569
- }
7570
-
7571
7853
  // Now select a reasonable chunk size.
7572
7854
  int chunk_size = 16;
7573
7855
 
@@ -7583,7 +7865,7 @@ UseGgmlGemm2:;
7583
7865
  int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
7584
7866
 
7585
7867
  // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread.
7586
- // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggerganov/llama.cpp/pull/6915
7868
+ // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
7587
7869
  // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
7588
7870
  if (nchunk0 * nchunk1 < nth * 4 || wsp_ggml_is_numa()) {
7589
7871
  // distribute the thread work across the inner or outer loop based on which one is larger
@@ -7595,28 +7877,6 @@ UseGgmlGemm2:;
7595
7877
  const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
7596
7878
  const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
7597
7879
 
7598
- if ((wsp_ggml_n_dims(src0) == 2) && gemv) {
7599
- const void * src1_wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
7600
- const size_t src1_col_stride = wsp_ggml_is_contiguous(src1) || src1->type != vec_dot_type ? wsp_ggml_row_size(vec_dot_type, ne10) : nb11;
7601
- int64_t src0_start = (ith * ne01) / nth;
7602
- int64_t src0_end = ((ith + 1) * ne01) / nth;
7603
- src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
7604
- src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
7605
- if (src0_start >= src0_end) return;
7606
-
7607
- // If there are more than three rows in src1, use gemm; otherwise, use gemv.
7608
- if (gemm && (ne11 > 3)) {
7609
- gemm(ne00, (float *)((char *) dst->data) + src0_start, ne01, (const char *) src0->data + src0_start * nb01,
7610
- (const char *) src1_wdata, ne11 - ne11 % 4, src0_end - src0_start);
7611
- }
7612
- for (int iter = gemm ? ne11 - ne11 % 4 : 0; iter < ne11; iter++) {
7613
- gemv(ne00, (float *)((char *) dst->data + (iter * nb1)) + src0_start, ne01,
7614
- (const char *) src0->data + src0_start * nb01, (const char *) src1_wdata + (src1_col_stride * iter), 1,
7615
- src0_end - src0_start);
7616
- }
7617
- return;
7618
- }
7619
-
7620
7880
  // The first chunk comes from our thread_id, the rest will get auto-assigned.
7621
7881
  int current_chunk = ith;
7622
7882
 
@@ -7630,7 +7890,15 @@ UseGgmlGemm2:;
7630
7890
  const int64_t ir1_start = dr1 * ith1;
7631
7891
  const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
7632
7892
 
7633
- wsp_ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7893
+ // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols
7894
+ int64_t num_rows_per_vec_dot = vec_dot_num_rows;
7895
+
7896
+ // these checks are needed to avoid crossing dim1 boundaries
7897
+ // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity
7898
+ if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
7899
+ num_rows_per_vec_dot = 1;
7900
+ }
7901
+ wsp_ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
7634
7902
 
7635
7903
  if (nth >= nchunk0 * nchunk1) {
7636
7904
  break;
@@ -7642,6 +7910,84 @@ UseGgmlGemm2:;
7642
7910
 
7643
7911
  // wsp_ggml_compute_forward_mul_mat_id
7644
7912
 
7913
+ #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
7914
+
7915
+ struct mmid_row_mapping {
7916
+ int32_t i1;
7917
+ int32_t i2;
7918
+ };
7919
+
7920
+ static void wsp_ggml_compute_forward_mul_mat_id_one_chunk(
7921
+ struct wsp_ggml_tensor * dst,
7922
+ const struct wsp_ggml_tensor * src0,
7923
+ const struct wsp_ggml_tensor * src1,
7924
+ const struct wsp_ggml_tensor * ids,
7925
+ const int64_t cur_a,
7926
+ const int64_t ir0_start,
7927
+ const int64_t ir0_end,
7928
+ const int64_t ir1_start,
7929
+ const int64_t ir1_end,
7930
+ const char * src0_cur,
7931
+ const struct mmid_row_mapping * matrix_rows,
7932
+ const size_t row_size,
7933
+ const bool src1_cont,
7934
+ const void * wdata) {
7935
+
7936
+ WSP_GGML_TENSOR_BINARY_OP_LOCALS
7937
+
7938
+ const enum wsp_ggml_type type = src0->type;
7939
+
7940
+ wsp_ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
7941
+ enum wsp_ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7942
+
7943
+ const int64_t blck_0 = 16;
7944
+ const int64_t blck_1 = 16;
7945
+
7946
+ float tmp[16];
7947
+
7948
+ for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
7949
+ for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
7950
+ for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
7951
+ const int64_t _i12 = ir1; // logical row index for this expert
7952
+
7953
+ struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
7954
+ const int id = row_mapping.i1; // selected expert index
7955
+
7956
+ const int64_t i11 = id % ne11;
7957
+ const int64_t i12 = row_mapping.i2; // row index in src1
7958
+
7959
+ const int64_t i1 = id; // selected expert index
7960
+ const int64_t i2 = i12; // row
7961
+
7962
+ // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
7963
+ // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
7964
+ // the original src1 data pointer, so we should index using the indices directly
7965
+ // TODO: this is a bit of a hack, we should probably have a better way to handle this
7966
+ const char * src1_col = (const char *) wdata +
7967
+ (src1_cont || src1->type != vec_dot_type
7968
+ ? (i11 + i12*ne11)*row_size
7969
+ : (i11*nb11 + i12*nb12));
7970
+
7971
+ float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
7972
+
7973
+ for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
7974
+ vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
7975
+ }
7976
+
7977
+ memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
7978
+ }
7979
+ }
7980
+ }
7981
+ }
7982
+
7983
+ static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
7984
+
7985
+ void * ptr = *p;
7986
+ ptr = (void *) WSP_GGML_PAD((uintptr_t) ptr, align);
7987
+ *p = (void *) ((char *) ptr + size);
7988
+ return ptr;
7989
+ }
7990
+
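
A minimal usage sketch of the incr_ptr_aligned helper added above (not library code; bump_aligned mirrors the helper and the sizes and names are illustrative): one scratch buffer is carved into aligned sub-arrays, and the consumed total is checked against the planned work-buffer size, which is how the MUL_MAT_ID path below lays out its row counts, row mappings, and per-expert chunk counters.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    // round p up to a multiple of align (align must be a power of two)
    #define ALIGN_UP(p, align) (((p) + (align) - 1) & ~((uintptr_t)(align) - 1))

    static void * bump_aligned(void ** p, size_t size, size_t align) {
        void * ptr = (void *) ALIGN_UP((uintptr_t) *p, align);
        *p = (char *) ptr + size;
        return ptr;
    }

    static void layout_example(void * wdata, size_t wsize, int n_as, int n_rows) {
        void * cur = wdata;

        // per-expert row counts, row mappings, and one cache line of chunk counters per expert
        int64_t * row_counts = bump_aligned(&cur, n_as*sizeof(int64_t), sizeof(int64_t));
        int32_t * row_map    = bump_aligned(&cur, (size_t)n_as*n_rows*2*sizeof(int32_t), sizeof(int64_t));
        char    * chunk_ctrs = bump_aligned(&cur, (size_t)n_as*64, 64);

        // the graph planner must have reserved at least this much
        assert((size_t)((char *) cur - (char *) wdata) <= wsize);
        (void) row_counts; (void) row_map; (void) chunk_ctrs;
    }
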
7645
7991
  static void wsp_ggml_compute_forward_mul_mat_id(
7646
7992
  const struct wsp_ggml_compute_params * params,
7647
7993
  struct wsp_ggml_tensor * dst) {
@@ -7659,11 +8005,8 @@ static void wsp_ggml_compute_forward_mul_mat_id(
7659
8005
 
7660
8006
  const bool src1_cont = wsp_ggml_is_contiguous(src1);
7661
8007
 
7662
- wsp_ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
7663
8008
  enum wsp_ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
7664
8009
  wsp_ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
7665
- int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
7666
- wsp_ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
7667
8010
 
7668
8011
  // we don't support permuted src0 or src1
7669
8012
  WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(type));
@@ -7679,21 +8022,27 @@ static void wsp_ggml_compute_forward_mul_mat_id(
7679
8022
  const int n_ids = ids->ne[0]; // n_expert_used
7680
8023
  const int n_as = ne02; // n_expert
7681
8024
 
7682
- char * wdata_src1_end = (src1->type == vec_dot_type) ?
7683
- (char *) params->wdata :
7684
- (char *) params->wdata + WSP_GGML_PAD(wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(src1)), sizeof(int64_t));
8025
+ void * wdata_cur = params->wdata;
7685
8026
 
7686
- struct mmid_row_mapping {
7687
- int32_t i1;
7688
- int32_t i2;
7689
- };
8027
+ if (src1->type != vec_dot_type) {
8028
+ incr_ptr_aligned(&wdata_cur, wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(src1)), sizeof(int64_t));
8029
+ }
8030
+
8031
+ int64_t * matrix_row_counts = // [n_as]
8032
+ incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
8033
+
8034
+ struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
8035
+ incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));
7690
8036
 
7691
- int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
7692
- struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
8037
+ char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
8038
+ incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
8039
+
8040
+ WSP_GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));
7693
8041
 
7694
8042
  if (src1->type != vec_dot_type) {
7695
8043
  char * wdata = params->wdata;
7696
8044
 
8045
+ const size_t nbw0 = wsp_ggml_type_size(vec_dot_type);
7697
8046
  const size_t nbw1 = wsp_ggml_row_size(vec_dot_type, ne10);
7698
8047
  const size_t nbw2 = nbw1*ne11;
7699
8048
  const size_t nbw3 = nbw2*ne12;
@@ -7701,19 +8050,32 @@ static void wsp_ggml_compute_forward_mul_mat_id(
7701
8050
  assert(params->wsize >= ne13*nbw3);
7702
8051
  WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
7703
8052
 
8053
+ #if 0
7704
8054
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
7705
- for (int64_t i12 = 0; i12 < ne12; ++i12) {
7706
- for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
8055
+ for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
8056
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
7707
8057
  from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
7708
8058
  (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
7709
8059
  ne10);
7710
8060
  }
7711
8061
  }
7712
8062
  }
8063
+ #else
8064
+ for (int64_t i13 = 0; i13 < ne13; ++i13) {
8065
+ for (int64_t i12 = 0; i12 < ne12; ++i12) {
8066
+ for (int64_t i11 = 0; i11 < ne11; ++i11) {
8067
+ size_t bs = wsp_ggml_blck_size(vec_dot_type);
8068
+ int64_t ne10_block_start = (ith * ne10/bs) / nth;
8069
+ int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth;
8070
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
8071
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
8072
+ (ne10_block_end - ne10_block_start) * bs);
8073
+ }
8074
+ }
8075
+ }
8076
+ #endif
7713
8077
  }
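
The new conversion loop above no longer hands whole rows to each thread; instead every src1 row is split between threads along ne10 in units of the quantization block size, so all threads stay busy even when the number of rows is small. A small sketch of that split (function name and signature are illustrative, not library code):

    #include <stdint.h>

    // [start, end) element range of a row that thread ith of nth should convert;
    // ne10 is assumed to be a multiple of blck_size
    static void row_split(int ith, int nth, int64_t ne10, int64_t blck_size,
                          int64_t * start, int64_t * end) {
        const int64_t nblocks = ne10/blck_size;
        *start = ((int64_t) ith     *nblocks)/nth*blck_size;
        *end   = ((int64_t)(ith + 1)*nblocks)/nth*blck_size;
    }
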
7714
8078
 
7715
- #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
7716
-
7717
8079
  if (ith == 0) {
7718
8080
  // initialize matrix_row_counts
7719
8081
  memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7731,9 +8093,14 @@ static void wsp_ggml_compute_forward_mul_mat_id(
7731
8093
  }
7732
8094
  }
7733
8095
 
8096
+ // reset current_chunk
8097
+ for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
8098
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
8099
+ *current_chunk_ctr = nth;
8100
+ }
8101
+
7734
8102
  wsp_ggml_barrier(params->threadpool);
7735
8103
 
7736
- // compute each matrix multiplication in sequence
7737
8104
  for (int cur_a = 0; cur_a < n_as; ++cur_a) {
7738
8105
  const int64_t cne1 = matrix_row_counts[cur_a];
7739
8106
 
@@ -7741,112 +8108,64 @@ static void wsp_ggml_compute_forward_mul_mat_id(
7741
8108
  continue;
7742
8109
  }
7743
8110
 
7744
- const char * src0_cur = (const char *) src0->data + cur_a*nb02;
7745
-
7746
- const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
8111
+ const char * src0_cur = (const char *) src0->data + cur_a * nb02;
8112
+ const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
7747
8113
  const size_t row_size = wsp_ggml_row_size(vec_dot_type, ne10);
7748
8114
 
7749
- const int64_t nr0 = ne01; // src0 rows
7750
- const int64_t nr1 = cne1; // src1 rows
7751
-
7752
- if (((wsp_ggml_n_dims(src0) - 1) == 2) && gemv) {
7753
- int64_t src0_cur_start = (ith * ne01) / nth;
7754
- int64_t src0_cur_end = ((ith + 1) * ne01) / nth;
7755
- src0_cur_start = (src0_cur_start % matmul_num_cols) ? src0_cur_start + matmul_num_cols - (src0_cur_start % matmul_num_cols): src0_cur_start;
7756
- src0_cur_end = (src0_cur_end % matmul_num_cols) ? src0_cur_end + matmul_num_cols - (src0_cur_end % matmul_num_cols): src0_cur_end;
7757
- if (src0_cur_start >= src0_cur_end) return;
7758
-
7759
- for (int ir1 = 0; ir1 < nr1; ir1++) {
7760
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, ir1);
7761
- const int id = row_mapping.i1; // selected expert index
7762
-
7763
- const int64_t i11 = id % ne11;
7764
- const int64_t i12 = row_mapping.i2; // row index in src1
7765
-
7766
- const int64_t i1 = id; // selected expert index
7767
- const int64_t i2 = i12; // row
7768
-
7769
- const char * src1_col = (const char *) wdata +
7770
- (src1_cont || src1->type != vec_dot_type
7771
- ? (i11 + i12 * ne11) * row_size
7772
- : (i11 * nb11 + i12 * nb12));
8115
+ const int64_t nr0 = ne01;
8116
+ const int64_t nr1 = cne1;
7773
8117
 
7774
- gemv(ne00, (float *)((char *) dst->data + (i1 * nb1 + i2 * nb2)) + src0_cur_start, ne01,
7775
- (const char *) src0_cur + src0_cur_start * nb01, src1_col, 1, src0_cur_end - src0_cur_start);
7776
- }
7777
- continue;
8118
+ int chunk_size = 16;
8119
+ if (nr0 == 1 || nr1 == 1) {
8120
+ chunk_size = 64;
7778
8121
  }
7779
8122
 
7780
- // distribute the thread work across the inner or outer loop based on which one is larger
7781
-
7782
- const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
7783
- const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
7784
-
7785
- const int64_t ith0 = ith % nth0;
7786
- const int64_t ith1 = ith / nth0;
7787
-
7788
- const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
7789
- const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
7790
-
7791
- const int64_t ir010 = dr0*ith0;
7792
- const int64_t ir011 = MIN(ir010 + dr0, nr0);
7793
-
7794
- const int64_t ir110 = dr1*ith1;
7795
- const int64_t ir111 = MIN(ir110 + dr1, nr1);
7796
-
7797
- // threads with no work simply yield (not sure if it helps)
7798
- //if (ir010 >= ir011 || ir110 >= ir111) {
7799
- // sched_yield();
7800
- // continue;
7801
- //}
7802
-
7803
- // block-tiling attempt
7804
- const int64_t blck_0 = 16;
7805
- const int64_t blck_1 = 16;
8123
+ #if defined(__aarch64__)
8124
+ // disable for ARM
8125
+ const bool disable_chunking = true;
8126
+ #else
8127
+ // disable for NUMA
8128
+ const bool disable_chunking = wsp_ggml_is_numa();
8129
+ #endif // defined(__aarch64__)
7806
8130
 
7807
- // attempt to reduce false-sharing (does not seem to make a difference)
7808
- float tmp[16];
8131
+ int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
8132
+ int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
7809
8133
 
7810
- for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
7811
- for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
7812
- for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
7813
- const int64_t _i12 = ir1; // logical row index for this expert
8134
+ if (nchunk0 * nchunk1 < nth * 4 || disable_chunking) {
8135
+ nchunk0 = nr0 > nr1 ? nth : 1;
8136
+ nchunk1 = nr0 > nr1 ? 1 : nth;
8137
+ }
7814
8138
 
7815
- struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
7816
- const int id = row_mapping.i1; // selected expert index
8139
+ const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
8140
+ const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;
7817
8141
 
7818
- const int64_t i11 = id % ne11;
7819
- const int64_t i12 = row_mapping.i2; // row index in src1
8142
+ int current_chunk = ith;
7820
8143
 
7821
- const int64_t i1 = id; // selected expert index
7822
- const int64_t i2 = i12; // row
8144
+ atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
7823
8145
 
7824
- // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
7825
- // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
7826
- // the original src1 data pointer, so we should index using the indices directly
7827
- // TODO: this is a bit of a hack, we should probably have a better way to handle this
7828
- const char * src1_col = (const char *) wdata +
7829
- (src1_cont || src1->type != vec_dot_type
7830
- ? (i11 + i12*ne11)*row_size
7831
- : (i11*nb11 + i12*nb12));
8146
+ while (current_chunk < nchunk0 * nchunk1) {
8147
+ const int64_t ith0 = current_chunk % nchunk0;
8148
+ const int64_t ith1 = current_chunk / nchunk0;
7832
8149
 
7833
- float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
8150
+ const int64_t ir0_start = dr0 * ith0;
8151
+ const int64_t ir0_end = MIN(ir0_start + dr0, nr0);
7834
8152
 
7835
- //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
7836
- // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
7837
- //}
8153
+ const int64_t ir1_start = dr1 * ith1;
8154
+ const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
7838
8155
 
7839
- for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
7840
- vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
7841
- }
8156
+ wsp_ggml_compute_forward_mul_mat_id_one_chunk(
8157
+ dst, src0, src1, ids, cur_a,
8158
+ ir0_start, ir0_end, ir1_start, ir1_end,
8159
+ src0_cur, matrix_rows, row_size, src1_cont, wdata
8160
+ );
7842
8161
 
7843
- memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
7844
- }
8162
+ if (nth >= nchunk0 * nchunk1) {
8163
+ break;
7845
8164
  }
8165
+
8166
+ current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
7846
8167
  }
7847
8168
  }
7848
-
7849
- #undef MMID_MATRIX_ROW
7850
8169
  }
7851
8170
 
7852
8171
  // wsp_ggml_compute_forward_out_prod
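
For context on the mul_mat_id changes above: the rough sketch below (not library code, and the population loop itself is outside this hunk, so this is an approximation based on the MMID_MATRIX_ROW layout) shows how src1 rows are grouped by expert before the per-expert multiplications. ids[i1][i0] selects the expert for slot i0 of token i1, and every (slot, token) pair is appended to that expert's bucket.

    #include <stdint.h>
    #include <string.h>

    struct row_mapping { int32_t i1; int32_t i2; };   // (expert slot, token index)

    static void group_rows_by_expert(const int32_t * ids, int n_ids, int n_tokens, int n_expert,
                                     int64_t * counts,             // [n_expert]
                                     struct row_mapping * rows) {  // [n_expert][n_ids*n_tokens]
        memset(counts, 0, n_expert*sizeof(int64_t));
        for (int i1 = 0; i1 < n_tokens; ++i1) {
            for (int i0 = 0; i0 < n_ids; ++i0) {
                const int32_t e = ids[i1*n_ids + i0];               // chosen expert
                rows[e*(n_ids*n_tokens) + counts[e]++] =
                    (struct row_mapping) { .i1 = i0, .i2 = i1 };
            }
        }
    }
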
@@ -7867,12 +8186,13 @@ static void wsp_ggml_compute_forward_out_prod_f32(
7867
8186
  const int ith = params->ith;
7868
8187
  const int nth = params->nth;
7869
8188
 
7870
- WSP_GGML_ASSERT(ne0 == ne00);
7871
- WSP_GGML_ASSERT(ne1 == ne10);
7872
- WSP_GGML_ASSERT(ne2 == ne02);
7873
- WSP_GGML_ASSERT(ne02 == ne12);
7874
- WSP_GGML_ASSERT(ne3 == ne13);
7875
- WSP_GGML_ASSERT(ne03 == ne13);
8189
+ WSP_GGML_ASSERT(ne0 == ne00);
8190
+ WSP_GGML_ASSERT(ne1 == ne10);
8191
+ WSP_GGML_ASSERT(ne2 == ne12);
8192
+ WSP_GGML_ASSERT(ne3 == ne13);
8193
+
8194
+ WSP_GGML_ASSERT(ne2 % ne02 == 0);
8195
+ WSP_GGML_ASSERT(ne3 % ne03 == 0);
7876
8196
 
7877
8197
  // we don't support permuted src0 or src1
7878
8198
  WSP_GGML_ASSERT(nb00 == sizeof(float));
@@ -7914,6 +8234,10 @@ static void wsp_ggml_compute_forward_out_prod_f32(
7914
8234
  const int64_t blck_0 = MAX(WSP_GGML_VEC_MAD_UNROLL, 32);
7915
8235
  const int64_t blck_1 = 16;
7916
8236
 
8237
+ // dps == dst per src0, used for group query attention
8238
+ const int64_t dps2 = ne2 / ne02;
8239
+ const int64_t dps3 = ne3 / ne03;
8240
+
7917
8241
  for (int64_t bir = ir0; bir < ir1; bir += blck_1) {
7918
8242
  const int64_t bir1 = MIN(bir + blck_1, ir1);
7919
8243
  for (int64_t bi01 = 0; bi01 < ne01; bi01 += blck_0) {
@@ -7924,8 +8248,8 @@ static void wsp_ggml_compute_forward_out_prod_f32(
7924
8248
  const int64_t i2 = (ir - i3*ne2*ne1)/ne1;
7925
8249
  const int64_t i1 = (ir - i3*ne2*ne1 - i2*ne1);
7926
8250
 
7927
- const int64_t i02 = i2;
7928
- const int64_t i03 = i3;
8251
+ const int64_t i02 = i2 / dps2;
8252
+ const int64_t i03 = i3 / dps3;
7929
8253
 
7930
8254
  //const int64_t i10 = i1;
7931
8255
  const int64_t i12 = i2;
@@ -7938,7 +8262,7 @@ static void wsp_ggml_compute_forward_out_prod_f32(
7938
8262
 
7939
8263
  float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
7940
8264
  float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
7941
- float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
8265
+ float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
7942
8266
 
7943
8267
  wsp_ggml_vec_mad_f32_unroll(ne0, nb01, nb11, d, s0, s1);
7944
8268
  }
@@ -7947,7 +8271,7 @@ static void wsp_ggml_compute_forward_out_prod_f32(
7947
8271
 
7948
8272
  float * s0 = (float *) ((char *) src0->data + ( i01*nb01 + i02*nb02 + i03*nb03));
7949
8273
  float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
7950
- float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
8274
+ float * d = (float *) ((char *) dst->data + ( i1*nb1 + i2*nb2 + i3*nb3));
7951
8275
 
7952
8276
  wsp_ggml_vec_mad_f32(ne0, d, s0, *s1);
7953
8277
  }
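
A small illustration of the broadcast indexing introduced in this out_prod hunk (function name is illustrative): with dps2 = ne2/ne02 destination channels per src0 channel, every group of dps2 dst channels reads from the same src0 channel, which is what grouped-query attention needs when one K/V head serves several Q heads.

    #include <stdint.h>

    static inline int64_t src0_channel_for(int64_t i2, int64_t ne2, int64_t ne02) {
        const int64_t dps2 = ne2/ne02;   // requires ne2 % ne02 == 0, as asserted above
        return i2/dps2;
    }
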
@@ -8084,9 +8408,6 @@ static void wsp_ggml_compute_forward_out_prod(
8084
8408
  case WSP_GGML_TYPE_IQ4_XS:
8085
8409
  case WSP_GGML_TYPE_IQ3_S:
8086
8410
  case WSP_GGML_TYPE_IQ2_S:
8087
- case WSP_GGML_TYPE_Q4_0_4_4:
8088
- case WSP_GGML_TYPE_Q4_0_4_8:
8089
- case WSP_GGML_TYPE_Q4_0_8_8:
8090
8411
  {
8091
8412
  wsp_ggml_compute_forward_out_prod_q_f32(params, dst);
8092
8413
  } break;
@@ -8239,6 +8560,77 @@ static void wsp_ggml_compute_forward_set_f32(
8239
8560
  }
8240
8561
  }
8241
8562
 
8563
+ static void wsp_ggml_compute_forward_set_i32(
8564
+ const struct wsp_ggml_compute_params * params,
8565
+ struct wsp_ggml_tensor * dst) {
8566
+
8567
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
8568
+ const struct wsp_ggml_tensor * src1 = dst->src[1];
8569
+
8570
+ WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst));
8571
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0));
8572
+
8573
+ // view src0 and dst with these strides and data offset inbytes during set
8574
+ // nb0 is implicitly element_size because src0 and dst are contiguous
8575
+ size_t nb1 = ((int32_t *) dst->op_params)[0];
8576
+ size_t nb2 = ((int32_t *) dst->op_params)[1];
8577
+ size_t nb3 = ((int32_t *) dst->op_params)[2];
8578
+ size_t offset = ((int32_t *) dst->op_params)[3];
8579
+ bool inplace = (bool) ((int32_t *) dst->op_params)[4];
8580
+
8581
+ if (!inplace) {
8582
+ if (params->ith == 0) {
8583
+ // memcpy needs to be synchronized across threads to avoid race conditions.
8584
+ // => do it in INIT phase
8585
+ memcpy(
8586
+ ((char *) dst->data),
8587
+ ((char *) src0->data),
8588
+ wsp_ggml_nbytes(dst));
8589
+ }
8590
+ wsp_ggml_barrier(params->threadpool);
8591
+ }
8592
+
8593
+ const int ith = params->ith;
8594
+ const int nth = params->nth;
8595
+
8596
+ const int nr = wsp_ggml_nrows(src1);
8597
+ const int nc = src1->ne[0];
8598
+
8599
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne)
8600
+ WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
8601
+
8602
+ // src0 and dst as viewed during set
8603
+ const size_t nb0 = wsp_ggml_element_size(src0);
8604
+
8605
+ const int im0 = (ne10 == 0 ? 0 : ne10-1);
8606
+ const int im1 = (ne11 == 0 ? 0 : ne11-1);
8607
+ const int im2 = (ne12 == 0 ? 0 : ne12-1);
8608
+ const int im3 = (ne13 == 0 ? 0 : ne13-1);
8609
+
8610
+ WSP_GGML_ASSERT(offset + im0*nb0 + im1*nb1 + im2*nb2 + im3*nb3 <= wsp_ggml_nbytes(dst));
8611
+
8612
+ WSP_GGML_ASSERT(nb10 == sizeof(int32_t));
8613
+
8614
+ // rows per thread
8615
+ const int dr = (nr + nth - 1)/nth;
8616
+
8617
+ // row range for this thread
8618
+ const int ir0 = dr*ith;
8619
+ const int ir1 = MIN(ir0 + dr, nr);
8620
+
8621
+ for (int ir = ir0; ir < ir1; ++ir) {
8622
+ // src0 and dst are viewed with shape of src1 and offset
8623
+ // => same indices
8624
+ const int i3 = ir/(ne12*ne11);
8625
+ const int i2 = (ir - i3*ne12*ne11)/ne11;
8626
+ const int i1 = (ir - i3*ne12*ne11 - i2*ne11);
8627
+
8628
+ wsp_ggml_vec_cpy_i32(nc,
8629
+ (int32_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + offset),
8630
+ (int32_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
8631
+ }
8632
+ }
8633
+
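
The new I32 path follows the same parameter layout as the existing F32 SET kernel: dst is viewed with the strides and byte offset stored in op_params and src1 is copied into that view. A sketch of the decode step (struct and function names are illustrative, not library code):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    struct set_view { size_t nb1, nb2, nb3, offset; bool inplace; };

    static struct set_view decode_set_params(const int32_t * op_params) {
        struct set_view v;
        v.nb1     = (size_t) op_params[0];   // view strides of dst, in bytes
        v.nb2     = (size_t) op_params[1];
        v.nb3     = (size_t) op_params[2];
        v.offset  = (size_t) op_params[3];   // byte offset of the view into dst
        v.inplace = op_params[4] != 0;       // if false, src0 is first copied into dst
        return v;
    }
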
8242
8634
  static void wsp_ggml_compute_forward_set(
8243
8635
  const struct wsp_ggml_compute_params * params,
8244
8636
  struct wsp_ggml_tensor * dst) {
@@ -8250,6 +8642,10 @@ static void wsp_ggml_compute_forward_set(
8250
8642
  {
8251
8643
  wsp_ggml_compute_forward_set_f32(params, dst);
8252
8644
  } break;
8645
+ case WSP_GGML_TYPE_I32:
8646
+ {
8647
+ wsp_ggml_compute_forward_set_i32(params, dst);
8648
+ } break;
8253
8649
  case WSP_GGML_TYPE_F16:
8254
8650
  case WSP_GGML_TYPE_BF16:
8255
8651
  case WSP_GGML_TYPE_Q4_0:
@@ -8274,9 +8670,6 @@ static void wsp_ggml_compute_forward_set(
8274
8670
  case WSP_GGML_TYPE_IQ4_XS:
8275
8671
  case WSP_GGML_TYPE_IQ3_S:
8276
8672
  case WSP_GGML_TYPE_IQ2_S:
8277
- case WSP_GGML_TYPE_Q4_0_4_4:
8278
- case WSP_GGML_TYPE_Q4_0_4_8:
8279
- case WSP_GGML_TYPE_Q4_0_8_8:
8280
8673
  default:
8281
8674
  {
8282
8675
  WSP_GGML_ABORT("fatal error");
@@ -8538,9 +8931,6 @@ static void wsp_ggml_compute_forward_get_rows(
8538
8931
  case WSP_GGML_TYPE_IQ4_XS:
8539
8932
  case WSP_GGML_TYPE_IQ3_S:
8540
8933
  case WSP_GGML_TYPE_IQ2_S:
8541
- case WSP_GGML_TYPE_Q4_0_4_4:
8542
- case WSP_GGML_TYPE_Q4_0_4_8:
8543
- case WSP_GGML_TYPE_Q4_0_8_8:
8544
8934
  {
8545
8935
  wsp_ggml_compute_forward_get_rows_q(params, dst);
8546
8936
  } break;
@@ -8957,9 +9347,9 @@ static void wsp_ggml_compute_forward_soft_max(
8957
9347
  }
8958
9348
 
8959
9349
 
8960
- // wsp_ggml_compute_forward_soft_max_back
9350
+ // wsp_ggml_compute_forward_soft_max_ext_back
8961
9351
 
8962
- static void wsp_ggml_compute_forward_soft_max_back_f32(
9352
+ static void wsp_ggml_compute_forward_soft_max_ext_back_f32(
8963
9353
  const struct wsp_ggml_compute_params * params,
8964
9354
  struct wsp_ggml_tensor * dst) {
8965
9355
 
@@ -8972,6 +9362,14 @@ static void wsp_ggml_compute_forward_soft_max_back_f32(
8972
9362
  WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst));
8973
9363
  WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src1, dst));
8974
9364
 
9365
+ float scale = 1.0f;
9366
+ float max_bias = 0.0f;
9367
+
9368
+ memcpy(&scale, (const float *) dst->op_params + 0, sizeof(float));
9369
+ memcpy(&max_bias, (const float *) dst->op_params + 1, sizeof(float));
9370
+
9371
+ WSP_GGML_ASSERT(max_bias == 0.0f);
9372
+
8975
9373
  // TODO: handle transposed/permuted matrices
8976
9374
 
8977
9375
  const int ith = params->ith;
@@ -9020,10 +9418,11 @@ static void wsp_ggml_compute_forward_soft_max_back_f32(
9020
9418
 
9021
9419
  // linear runtime, no additional memory
9022
9420
  float dot_y_dy = 0;
9023
- wsp_ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
9024
- wsp_ggml_vec_cpy_f32 (nc, dx, dy);
9025
- wsp_ggml_vec_acc1_f32(nc, dx, -dot_y_dy);
9026
- wsp_ggml_vec_mul_f32 (nc, dx, dx, y);
9421
+ wsp_ggml_vec_dot_f32 (nc, &dot_y_dy, 0, y, 0, dy, 0, 1);
9422
+ wsp_ggml_vec_cpy_f32 (nc, dx, dy);
9423
+ wsp_ggml_vec_acc1_f32 (nc, dx, -dot_y_dy);
9424
+ wsp_ggml_vec_mul_f32 (nc, dx, dx, y);
9425
+ wsp_ggml_vec_scale_f32(nc, dx, scale);
9027
9426
 
9028
9427
  #ifndef NDEBUG
9029
9428
  for (int i = 0; i < nc; ++i) {
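
In short, the renamed backward kernel now honors the scale stored in the forward op's parameters: with y = softmax(scale * x) and dy the incoming gradient, the lines above compute dx = scale * y * (dy - dot(y, dy)) row by row; max_bias (ALiBi) remains unsupported here, hence the assertion that it is zero.
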
@@ -9034,7 +9433,7 @@ static void wsp_ggml_compute_forward_soft_max_back_f32(
9034
9433
  }
9035
9434
  }
9036
9435
 
9037
- static void wsp_ggml_compute_forward_soft_max_back(
9436
+ static void wsp_ggml_compute_forward_soft_max_ext_back(
9038
9437
  const struct wsp_ggml_compute_params * params,
9039
9438
  struct wsp_ggml_tensor * dst) {
9040
9439
 
@@ -9043,7 +9442,7 @@ static void wsp_ggml_compute_forward_soft_max_back(
9043
9442
  switch (src0->type) {
9044
9443
  case WSP_GGML_TYPE_F32:
9045
9444
  {
9046
- wsp_ggml_compute_forward_soft_max_back_f32(params, dst);
9445
+ wsp_ggml_compute_forward_soft_max_ext_back_f32(params, dst);
9047
9446
  } break;
9048
9447
  default:
9049
9448
  {
@@ -9060,10 +9459,6 @@ static void wsp_ggml_compute_forward_clamp_f32(
9060
9459
 
9061
9460
  const struct wsp_ggml_tensor * src0 = dst->src[0];
9062
9461
 
9063
- if (params->ith != 0) {
9064
- return;
9065
- }
9066
-
9067
9462
  float min;
9068
9463
  float max;
9069
9464
  memcpy(&min, (float *) dst->op_params + 0, sizeof(float));
@@ -9130,9 +9525,6 @@ static void wsp_ggml_compute_forward_clamp(
9130
9525
  case WSP_GGML_TYPE_IQ3_S:
9131
9526
  case WSP_GGML_TYPE_IQ2_S:
9132
9527
  case WSP_GGML_TYPE_Q8_K:
9133
- case WSP_GGML_TYPE_Q4_0_4_4:
9134
- case WSP_GGML_TYPE_Q4_0_4_8:
9135
- case WSP_GGML_TYPE_Q4_0_8_8:
9136
9528
  case WSP_GGML_TYPE_I8:
9137
9529
  case WSP_GGML_TYPE_I16:
9138
9530
  case WSP_GGML_TYPE_I32:
@@ -9187,6 +9579,64 @@ static void wsp_ggml_rope_cache_init(
9187
9579
  }
9188
9580
  }
9189
9581
 
9582
+ static void wsp_ggml_mrope_cache_init(
9583
+ float theta_base_t, float theta_base_h, float theta_base_w, float theta_base_e, int sections[4], bool indep_sects,
9584
+ float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
9585
+ float * cache, float sin_sign, float theta_scale) {
9586
+ // ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
9587
+ float theta_t = theta_base_t;
9588
+ float theta_h = theta_base_h;
9589
+ float theta_w = theta_base_w;
9590
+ float theta_e = theta_base_e; // extra position id for vision encoder
9591
+ int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
9592
+ int sec_w = sections[1] + sections[0];
9593
+ int sec_e = sections[2] + sec_w;
9594
+ WSP_GGML_ASSERT(sect_dims <= ne0);
9595
+
9596
+ for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
9597
+ const float ff = freq_factors ? freq_factors[i0/2] : 1.0f;
9598
+
9599
+ int sector = (i0 / 2) % sect_dims;
9600
+ if (indep_sects) {
9601
+ // compute theta independently for each dim sections
9602
+ // (i.e. reset corresponding theta when `i0` go from one section to another)
9603
+ if (sector == 0) {
9604
+ theta_t = theta_base_t;
9605
+ }
9606
+ else if (sector == sections[0]) {
9607
+ theta_h = theta_base_h;;
9608
+ }
9609
+ else if (sector == sec_w) {
9610
+ theta_w = theta_base_w;
9611
+ }
9612
+ else if (sector == sec_e) {
9613
+ theta_e = theta_base_e;
9614
+ }
9615
+ }
9616
+
9617
+ float theta = theta_t;
9618
+ if (sector >= sections[0] && sector < sec_w) {
9619
+ theta = theta_h;
9620
+ }
9621
+ else if (sector >= sec_w && sector < sec_w + sections[2]) {
9622
+ theta = theta_w;
9623
+ }
9624
+ else if (sector >= sec_w + sections[2]) {
9625
+ theta = theta_e;
9626
+ }
9627
+
9628
+ rope_yarn(
9629
+ theta/ff, freq_scale, corr_dims, i0, ext_factor, mscale, &cache[i0 + 0], &cache[i0 + 1]
9630
+ );
9631
+ cache[i0 + 1] *= sin_sign;
9632
+
9633
+ theta_t *= theta_scale;
9634
+ theta_w *= theta_scale;
9635
+ theta_h *= theta_scale;
9636
+ theta_e *= theta_scale;
9637
+ }
9638
+ }
9639
+
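
A sketch of the section lookup performed by wsp_ggml_mrope_cache_init above (enum and function names are illustrative, not library code). For M-RoPE the position input carries four streams laid out back to back, read in the callers below as pos[i2], pos[i2 + ne2], pos[i2 + 2*ne2] and pos[i2 + 3*ne2], and the rotary dimensions are split into four consecutive sections whose widths come from sections[4]; each dim pair picks its base angle from the section it falls into.

    #include <stdint.h>

    typedef enum { POS_T, POS_H, POS_W, POS_E } pos_kind;  // time, height, width, extra

    static pos_kind mrope_section_of(int64_t i0, const int sections[4]) {
        const int sect_dims = sections[0] + sections[1] + sections[2] + sections[3];
        const int sec_w     = sections[0] + sections[1];
        const int sector    = (int)(i0/2) % sect_dims;

        if (sector < sections[0])         return POS_T;
        if (sector < sec_w)               return POS_H;
        if (sector < sec_w + sections[2]) return POS_W;
        return POS_E;
    }
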
9190
9640
  static void wsp_ggml_compute_forward_rope_f32(
9191
9641
  const struct wsp_ggml_compute_params * params,
9192
9642
  struct wsp_ggml_tensor * dst,
@@ -9197,6 +9647,7 @@ static void wsp_ggml_compute_forward_rope_f32(
9197
9647
  const struct wsp_ggml_tensor * src2 = dst->src[2];
9198
9648
 
9199
9649
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
9650
+ int sections[4];
9200
9651
 
9201
9652
  //const int n_past = ((int32_t *) dst->op_params)[0];
9202
9653
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9210,6 +9661,7 @@ static void wsp_ggml_compute_forward_rope_f32(
9210
9661
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
9211
9662
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
9212
9663
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
9664
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
9213
9665
 
9214
9666
  WSP_GGML_TENSOR_UNARY_OP_LOCALS
9215
9667
 
@@ -9242,6 +9694,16 @@ static void wsp_ggml_compute_forward_rope_f32(
9242
9694
  wsp_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
9243
9695
 
9244
9696
  const bool is_neox = mode & WSP_GGML_ROPE_TYPE_NEOX;
9697
+ const bool is_mrope = mode & WSP_GGML_ROPE_TYPE_MROPE; // wsp_ggml_rope_multi, multimodal rotary position embedding
9698
+ const bool is_vision = mode == WSP_GGML_ROPE_TYPE_VISION;
9699
+
9700
+ if (is_mrope) {
9701
+ WSP_GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
9702
+ }
9703
+
9704
+ if (is_vision) {
9705
+ WSP_GGML_ASSERT(n_dims == ne0/2);
9706
+ }
9245
9707
 
9246
9708
  const float * freq_factors = NULL;
9247
9709
  if (src2 != NULL) {
@@ -9257,18 +9719,63 @@ static void wsp_ggml_compute_forward_rope_f32(
9257
9719
 
9258
9720
  const int32_t * pos = (const int32_t *) src1->data;
9259
9721
 
9260
- for (int64_t i3 = 0; i3 < ne3; i3++) {
9261
- for (int64_t i2 = 0; i2 < ne2; i2++) {
9262
- const int64_t p = pos[i2];
9722
+ for (int64_t i3 = 0; i3 < ne3; i3++) { // batch
9723
+ for (int64_t i2 = 0; i2 < ne2; i2++) { // seq-len
9263
9724
 
9264
9725
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
9265
- wsp_ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9726
+ if (!is_mrope) {
9727
+ const int64_t p = pos[i2];
9728
+ wsp_ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9729
+ }
9730
+ else {
9731
+ const int64_t p_t = pos[i2];
9732
+ const int64_t p_h = pos[i2 + ne2];
9733
+ const int64_t p_w = pos[i2 + ne2 * 2];
9734
+ const int64_t p_e = pos[i2 + ne2 * 3];
9735
+ wsp_ggml_mrope_cache_init(
9736
+ p_t, p_h, p_w, p_e, sections, is_vision,
9737
+ freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9738
+ }
9266
9739
 
9267
- for (int64_t i1 = 0; i1 < ne1; i1++) {
9740
+ for (int64_t i1 = 0; i1 < ne1; i1++) { // attn-heads
9268
9741
  if (ir++ < ir0) continue;
9269
9742
  if (ir > ir1) break;
9270
9743
 
9271
- if (!is_neox) {
9744
+ if (is_neox || is_mrope) {
9745
+ if (is_vision){
9746
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9747
+ const int64_t ic = i0/2;
9748
+
9749
+ const float cos_theta = cache[i0 + 0];
9750
+ const float sin_theta = cache[i0 + 1];
9751
+
9752
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
9753
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9754
+
9755
+ const float x0 = src[0];
9756
+ const float x1 = src[n_dims];
9757
+
9758
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
9759
+ dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
9760
+ }
9761
+ } else {
9762
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9763
+ const int64_t ic = i0/2;
9764
+
9765
+ const float cos_theta = cache[i0 + 0];
9766
+ const float sin_theta = cache[i0 + 1];
9767
+
9768
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
9769
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9770
+
9771
+ const float x0 = src[0];
9772
+ const float x1 = src[n_dims/2];
9773
+
9774
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
9775
+ dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
9776
+ }
9777
+ }
9778
+ } else {
9272
9779
  for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9273
9780
  const float cos_theta = cache[i0 + 0];
9274
9781
  const float sin_theta = cache[i0 + 1];
@@ -9282,8 +9789,10 @@ static void wsp_ggml_compute_forward_rope_f32(
9282
9789
  dst_data[0] = x0*cos_theta - x1*sin_theta;
9283
9790
  dst_data[1] = x0*sin_theta + x1*cos_theta;
9284
9791
  }
9285
- } else {
9286
- for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9792
+ }
9793
+
9794
+ if (is_vision) {
9795
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9287
9796
  const int64_t ic = i0/2;
9288
9797
 
9289
9798
  const float cos_theta = cache[i0 + 0];
@@ -9293,19 +9802,20 @@ static void wsp_ggml_compute_forward_rope_f32(
9293
9802
  float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9294
9803
 
9295
9804
  const float x0 = src[0];
9296
- const float x1 = src[n_dims/2];
9805
+ const float x1 = src[n_dims];
9297
9806
 
9298
- dst_data[0] = x0*cos_theta - x1*sin_theta;
9299
- dst_data[n_dims/2] = x0*sin_theta + x1*cos_theta;
9807
+ dst_data[0] = x0*cos_theta - x1*sin_theta;
9808
+ dst_data[n_dims] = x0*sin_theta + x1*cos_theta;
9300
9809
  }
9301
- }
9302
-
9303
- for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9304
- const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
9305
- float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
9810
+ } else {
9811
+ // fill the remain channels with data from src tensor
9812
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9813
+ const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
9814
+ float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
9306
9815
 
9307
- dst_data[0] = src[0];
9308
- dst_data[1] = src[1];
9816
+ dst_data[0] = src[0];
9817
+ dst_data[1] = src[1];
9818
+ }
9309
9819
  }
9310
9820
  }
9311
9821
  }
@@ -9323,6 +9833,7 @@ static void wsp_ggml_compute_forward_rope_f16(
9323
9833
  const struct wsp_ggml_tensor * src2 = dst->src[2];
9324
9834
 
9325
9835
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
9836
+ int sections[4];
9326
9837
 
9327
9838
  //const int n_past = ((int32_t *) dst->op_params)[0];
9328
9839
  const int n_dims = ((int32_t *) dst->op_params)[1];
@@ -9335,6 +9846,8 @@ static void wsp_ggml_compute_forward_rope_f16(
9335
9846
  memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float));
9336
9847
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
9337
9848
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
9849
+ memcpy(&sections, (int32_t *) dst->op_params + 11, sizeof(int)*4);
9850
+
9338
9851
 
9339
9852
  WSP_GGML_TENSOR_UNARY_OP_LOCALS
9340
9853
 
@@ -9367,6 +9880,16 @@ static void wsp_ggml_compute_forward_rope_f16(
9367
9880
  wsp_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);
9368
9881
 
9369
9882
  const bool is_neox = mode & WSP_GGML_ROPE_TYPE_NEOX;
9883
+ const bool is_mrope = mode & WSP_GGML_ROPE_TYPE_MROPE;
9884
+ const bool is_vision = mode == WSP_GGML_ROPE_TYPE_VISION;
9885
+
9886
+ if (is_mrope) {
9887
+ WSP_GGML_ASSERT(sections[0] > 0 || sections[1] > 0 || sections[2] > 0);
9888
+ }
9889
+
9890
+ if (is_vision) {
9891
+ WSP_GGML_ASSERT(n_dims == ne0/2);
9892
+ }
9370
9893
 
9371
9894
  const float * freq_factors = NULL;
9372
9895
  if (src2 != NULL) {
@@ -9384,16 +9907,61 @@ static void wsp_ggml_compute_forward_rope_f16(
9384
9907
 
9385
9908
  for (int64_t i3 = 0; i3 < ne3; i3++) {
9386
9909
  for (int64_t i2 = 0; i2 < ne2; i2++) {
9387
- const int64_t p = pos[i2];
9388
9910
 
9389
9911
  float * cache = (float *) params->wdata + (ne0 + CACHE_LINE_SIZE_F32)*ith;
9390
- wsp_ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9912
+ if (!is_mrope) {
9913
+ const int64_t p = pos[i2];
9914
+ wsp_ggml_rope_cache_init(p, freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9915
+ }
9916
+ else {
9917
+ const int64_t p_t = pos[i2];
9918
+ const int64_t p_h = pos[i2 + ne2];
9919
+ const int64_t p_w = pos[i2 + ne2 * 2];
9920
+ const int64_t p_e = pos[i2 + ne2 * 3];
9921
+ wsp_ggml_mrope_cache_init(
9922
+ p_t, p_h, p_w, p_e, sections, is_vision,
9923
+ freq_scale, freq_factors, corr_dims, ne0, ext_factor, attn_factor, cache, sin_sign, theta_scale);
9924
+ }
9391
9925
 
9392
9926
  for (int64_t i1 = 0; i1 < ne1; i1++) {
9393
9927
  if (ir++ < ir0) continue;
9394
9928
  if (ir > ir1) break;
9395
9929
 
9396
- if (!is_neox) {
9930
+ if (is_neox || is_mrope) {
9931
+ if (is_vision) {
9932
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9933
+ const int64_t ic = i0/2;
9934
+
9935
+ const float cos_theta = cache[i0 + 0];
9936
+ const float sin_theta = cache[i0 + 1];
9937
+
9938
+ const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
9939
+ wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9940
+
9941
+ const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
9942
+ const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims]);
9943
+
9944
+ dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
9945
+ dst_data[n_dims] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
9946
+ }
9947
+ } else {
9948
+ for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9949
+ const int64_t ic = i0/2;
9950
+
9951
+ const float cos_theta = cache[i0 + 0];
9952
+ const float sin_theta = cache[i0 + 1];
9953
+
9954
+ const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + ic*nb00);
9955
+ wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9956
+
9957
+ const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
9958
+ const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims/2]);
9959
+
9960
+ dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
9961
+ dst_data[n_dims/2] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
9962
+ }
9963
+ }
9964
+ } else {
9397
9965
  for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9398
9966
  const float cos_theta = cache[i0 + 0];
9399
9967
  const float sin_theta = cache[i0 + 1];
@@ -9407,8 +9975,10 @@ static void wsp_ggml_compute_forward_rope_f16(
9407
9975
  dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
9408
9976
  dst_data[1] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
9409
9977
  }
9410
- } else {
9411
- for (int64_t i0 = 0; i0 < n_dims; i0 += 2) {
9978
+ }
9979
+
9980
+ if (is_vision) {
9981
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9412
9982
  const int64_t ic = i0/2;
9413
9983
 
9414
9984
  const float cos_theta = cache[i0 + 0];
@@ -9418,19 +9988,19 @@ static void wsp_ggml_compute_forward_rope_f16(
9418
9988
  wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + ic*nb0);
9419
9989
 
9420
9990
  const float x0 = WSP_GGML_FP16_TO_FP32(src[0]);
9421
- const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims/2]);
9991
+ const float x1 = WSP_GGML_FP16_TO_FP32(src[n_dims]);
9422
9992
 
9423
- dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
9424
- dst_data[n_dims/2] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
9993
+ dst_data[0] = WSP_GGML_FP32_TO_FP16(x0*cos_theta - x1*sin_theta);
9994
+ dst_data[n_dims] = WSP_GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
9425
9995
  }
9426
- }
9427
-
9428
- for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9429
- const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
9430
- wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
9996
+ } else {
9997
+ for (int64_t i0 = n_dims; i0 < ne0; i0 += 2) {
9998
+ const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
9999
+ wsp_ggml_fp16_t * dst_data = (wsp_ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
9431
10000
 
9432
- dst_data[0] = src[0];
9433
- dst_data[1] = src[1];
10001
+ dst_data[0] = src[0];
10002
+ dst_data[1] = src[1];
10003
+ }
9434
10004
  }
9435
10005
  }
9436
10006
  }
@@ -9861,9 +10431,10 @@ static void wsp_ggml_compute_forward_im2col_back_f32(
9861
10431
  const struct wsp_ggml_compute_params * params,
9862
10432
  struct wsp_ggml_tensor * dst) {
9863
10433
 
9864
- const struct wsp_ggml_tensor * src0 = dst->src[0];
9865
- const struct wsp_ggml_tensor * src1 = dst->src[1];
10434
+ const struct wsp_ggml_tensor * src0 = dst->src[0]; // gradients of forward pass output
10435
+ const struct wsp_ggml_tensor * src1 = dst->src[1]; // convolution kernel
9866
10436
 
10437
+ WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
9867
10438
  WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
9868
10439
  WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
9869
10440
 
@@ -9885,11 +10456,11 @@ static void wsp_ggml_compute_forward_im2col_back_f32(
9885
10456
  const int64_t IH = is_2D ? ne1 : 1;
9886
10457
  const int64_t IW = ne0;
9887
10458
 
9888
- const int64_t KH = is_2D ? ne01 : 1;
9889
- const int64_t KW = ne00;
10459
+ const int64_t KH = is_2D ? ne11 : 1;
10460
+ const int64_t KW = ne10;
9890
10461
 
9891
- const int64_t OH = is_2D ? ne12 : 1;
9892
- const int64_t OW = ne11;
10462
+ const int64_t OH = is_2D ? ne02 : 1;
10463
+ const int64_t OW = ne01;
9893
10464
 
9894
10465
  int ofs0 = is_2D ? nb3 : nb2;
9895
10466
  int ofs1 = is_2D ? nb2 : nb1;
@@ -9935,9 +10506,9 @@ static void wsp_ggml_compute_forward_im2col_back_f32(
9935
10506
  continue;
9936
10507
  }
9937
10508
 
9938
- const float * const src_data = (const float *) src1->data
10509
+ const float * const grad_in = (const float *) src0->data
9939
10510
  + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
9940
- grad += src_data[iic*(KH*KW) + ikh*KW + ikw];
10511
+ grad += grad_in[iic*(KH*KW) + ikh*KW + ikw];
9941
10512
  }
9942
10513
  }
9943
10514
  float * dst_data = (float *)((char *) wdata + (in*ofs0 + iic*ofs1)); // [IH, IW]
@@ -10429,6 +11000,40 @@ static void wsp_ggml_compute_forward_pad(
10429
11000
  }
10430
11001
  }
10431
11002
 
11003
+ // wsp_ggml_compute_forward_pad_reflect_1d
11004
+
11005
+ static void wsp_ggml_compute_forward_pad_reflect_1d(
11006
+ const struct wsp_ggml_compute_params * params,
11007
+ struct wsp_ggml_tensor * dst) {
11008
+
11009
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
11010
+
11011
+ WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
11012
+ WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
11013
+
11014
+ const int ith = params->ith;
11015
+ const int nth = params->nth;
11016
+
11017
+ const int32_t * opts = (const int32_t *) dst->op_params;
11018
+ const int p0 = opts[0];
11019
+ const int p1 = opts[1];
11020
+
11021
+ WSP_GGML_TENSOR_UNARY_OP_LOCALS
11022
+
11023
+ for (int64_t i3 = 0; i3 < ne3; i3++) {
11024
+ for (int64_t i2 = 0; i2 < ne2; i2++) {
11025
+ for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
11026
+ float * left = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + p0*nb0);
11027
+ float * right = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + (ne0-p1-1)*nb0);
11028
+
11029
+ wsp_ggml_vec_cpy_f32(ne00, left, (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01));
11030
+
11031
+ for (int i0 = 1; i0 <= p0; i0++) { left[-i0] = left[i0]; }
11032
+ for (int i0 = 1; i0 <= p1; i0++) { right[i0] = right[-i0]; }
11033
+ }
11034
+ }
11035
+ }
11036
+ }
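
A standalone illustration of the reflect padding produced by the new op above (not library code): p0 samples are mirrored on the left and p1 on the right, excluding the edge sample itself, PyTorch ReflectionPad1d style. For p0 = 2, p1 = 1 a row [a b c d] becomes [c b a b c d c].

    #include <stdint.h>

    static void pad_reflect_1d_row(const float * src, int64_t n, int p0, int p1, float * dst) {
        float * left  = dst + p0;            // where src[0] lands
        float * right = dst + p0 + n - 1;    // where src[n-1] lands

        for (int64_t i = 0; i < n; i++) {
            left[i] = src[i];                // copy the row into the middle of dst
        }
        for (int i = 1; i <= p0; i++) { left[-i] = left[i];   }  // mirror the left edge
        for (int i = 1; i <= p1; i++) { right[i] = right[-i]; }  // mirror the right edge
    }
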
10432
11037
 
10433
11038
  // wsp_ggml_compute_forward_arange
10434
11039
 
@@ -11645,9 +12250,9 @@ static void wsp_ggml_compute_forward_add_rel_pos(
11645
12250
  static void wsp_ggml_compute_forward_rwkv_wkv6_f32(
11646
12251
  const struct wsp_ggml_compute_params * params,
11647
12252
  struct wsp_ggml_tensor * dst) {
11648
- const int64_t T = dst->src[1]->ne[3];
12253
+ const int64_t T = dst->src[1]->ne[2];
11649
12254
  const int64_t C = dst->ne[0];
11650
- const int64_t HEADS = dst->src[1]->ne[2];
12255
+ const int64_t HEADS = dst->src[1]->ne[1];
11651
12256
  const int64_t n_seqs = dst->src[5]->ne[1];
11652
12257
  const int64_t head_size = C / HEADS;
11653
12258
 
@@ -11842,6 +12447,197 @@ static void wsp_ggml_compute_forward_rwkv_wkv6(
11842
12447
  }
11843
12448
  }
11844
12449
 
12450
+ // wsp_ggml_compute_forward_gla
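
The kernel added below implements gated linear attention. As a reading aid, this scalar reference (not library code; single head, single sequence, names illustrative) shows the recurrence that the vectorized code realizes per head: at every timestep the state is updated as S[i][j] = g[i]*S[i][j] + k[i]*v[j] and the output accumulates out[j] += q[i]*scale*S[i][j].

    #include <stdint.h>

    static void gla_head_ref(int64_t T, int64_t D,              // timesteps, head size
                             const float * k, const float * v,
                             const float * q, const float * g,
                             float scale,
                             float * S,                          // [D][D] running state, updated in place
                             float * out) {                      // [T][D], assumed zero-initialized
        for (int64_t t = 0; t < T; t++) {
            for (int64_t i = 0; i < D; i++) {
                const float k_i = k[t*D + i];
                const float q_i = q[t*D + i]*scale;
                const float g_i = g[t*D + i];
                for (int64_t j = 0; j < D; j++) {
                    const float s = S[i*D + j]*g_i + k_i*v[t*D + j];  // gated state update
                    S[i*D + j]    = s;
                    out[t*D + j] += s*q_i;                            // accumulate the output row
                }
            }
        }
    }
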
12451
+
12452
+ static void wsp_ggml_compute_forward_gla_f32(
12453
+ const struct wsp_ggml_compute_params * params,
12454
+ struct wsp_ggml_tensor * dst) {
12455
+ const int64_t T = dst->src[1]->ne[2];
12456
+ const int64_t C = dst->ne[0];
12457
+ const int64_t HEADS = dst->src[1]->ne[1];
12458
+ const int64_t n_seqs = dst->src[4]->ne[1];
12459
+ const int64_t head_size = C / HEADS;
12460
+ const float scale = wsp_ggml_get_op_params_f32(dst, 0);
12461
+
12462
+ float * dst_data = (float *) dst->data;
12463
+ float * state = ((float *) dst->data) + C * T;
12464
+
12465
+ const int ith = params->ith;
12466
+ const int nth = params->nth;
12467
+
12468
+ if (ith >= HEADS) {
12469
+ return;
12470
+ }
12471
+
12472
+ const int h_start = (HEADS * ith) / nth;
12473
+ const int h_end = ((HEADS * (ith + 1)) / nth < HEADS) ?
12474
+ (HEADS * (ith + 1)) / nth : HEADS;
12475
+
12476
+ float * k = (float *) dst->src[0]->data;
12477
+ float * v = (float *) dst->src[1]->data;
12478
+ float * q = (float *) dst->src[2]->data;
12479
+ float * g = (float *) dst->src[3]->data;
12480
+
12481
+ size_t t_stride = HEADS * head_size; // Same to C
12482
+
12483
+ size_t h_stride = C / HEADS;
12484
+ WSP_GGML_ASSERT(C % HEADS == 0); // C must be divisible by HEADS
12485
+ size_t h_stride_2d = head_size * head_size;
12486
+
12487
+ if (ith == 0) {
12488
+ memset(dst_data, 0, T * C * sizeof(float));
12489
+ }
12490
+ wsp_ggml_barrier(params->threadpool);
12491
+
12492
+
12493
+ #if defined(__AVX__) && !defined(__AVX512F__)
12494
+ #define WSP_GGML_F32X WSP_GGML_F32x8
12495
+ #define WSP_GGML_F32X_SET1 WSP_GGML_F32x8_SET1
12496
+ #define WSP_GGML_F32X_LOAD WSP_GGML_F32x8_LOAD
12497
+ #define WSP_GGML_F32X_STORE WSP_GGML_F32x8_STORE
12498
+ #define WSP_GGML_F32X_MUL WSP_GGML_F32x8_MUL
12499
+ #define WSP_GGML_F32X_FMA WSP_GGML_F32x8_FMA
12500
+ #define GLA_VECTOR_SIZE 8
12501
+ #elif defined(__AVX512F__)
12502
+ #define WSP_GGML_F32X WSP_GGML_F32x16
12503
+ #define WSP_GGML_F32X_SET1 WSP_GGML_F32x16_SET1
12504
+ #define WSP_GGML_F32X_LOAD WSP_GGML_F32x16_LOAD
12505
+ #define WSP_GGML_F32X_STORE WSP_GGML_F32x16_STORE
12506
+ #define WSP_GGML_F32X_MUL WSP_GGML_F32x16_MUL
12507
+ #define WSP_GGML_F32X_FMA WSP_GGML_F32x16_FMA
12508
+ #define GLA_VECTOR_SIZE 16
12509
+ #elif defined(__ARM_NEON) && defined(__aarch64__)
12510
+ #define WSP_GGML_F32X WSP_GGML_F32x4
12511
+ #define WSP_GGML_F32X_SET1 WSP_GGML_F32x4_SET1
12512
+ #define WSP_GGML_F32X_LOAD WSP_GGML_F32x4_LOAD
12513
+ #define WSP_GGML_F32X_STORE WSP_GGML_F32x4_STORE
12514
+ #define WSP_GGML_F32X_MUL WSP_GGML_F32x4_MUL
12515
+ #define WSP_GGML_F32X_FMA WSP_GGML_F32x4_FMA
12516
+ #define GLA_VECTOR_SIZE 4
12517
+ #endif
12518
+
12519
+ #ifdef GLA_VECTOR_SIZE
12520
+ const int64_t vec_count = head_size / GLA_VECTOR_SIZE;
12521
+
12522
+ for (int64_t t = 0; t < T; t++) {
12523
+ size_t t_offset = t * t_stride;
12524
+ size_t state_offset = head_size * C * (t / (T / n_seqs));
12525
+ float * state_cur = state + state_offset;
12526
+ float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
12527
+
12528
+ for (int64_t h = h_start; h < h_end; h++) {
12529
+ size_t h_offset = h * h_stride;
12530
+ size_t t_h_offset = t_offset + h_offset;
12531
+ size_t h_2d_offset = h * h_stride_2d;
12532
+
12533
+ for (int64_t i = 0; i < head_size; i++) {
12534
+ size_t t_h_i_offset = t_h_offset + i;
12535
+ size_t h_2d_i_offset = h_2d_offset + i * h_stride;
12536
+
12537
+ float k_val = k[t_h_i_offset];
12538
+ float q_val = q[t_h_i_offset] * scale;
12539
+ float g_val = g[t_h_i_offset];
12540
+
12541
+ // Broadcast scalar values to vectors
12542
+ WSP_GGML_F32X k_vec = WSP_GGML_F32X_SET1(k_val);
12543
+ WSP_GGML_F32X q_vec = WSP_GGML_F32X_SET1(q_val);
12544
+ WSP_GGML_F32X g_vec = WSP_GGML_F32X_SET1(g_val);
12545
+
12546
+ for (int64_t j = 0; j < vec_count; j++) {
12547
+ size_t base_j = j * GLA_VECTOR_SIZE;
12548
+ size_t t_h_j_offset = t_h_offset + base_j;
12549
+ size_t h_2d_i_j_offset = h_2d_i_offset + base_j;
12550
+
12551
+ // Load x elements at once
12552
+ WSP_GGML_F32X v_vec = WSP_GGML_F32X_LOAD(&v[t_h_j_offset]);
12553
+ WSP_GGML_F32X prev_state_vec = WSP_GGML_F32X_LOAD(&state_prev[h_2d_i_j_offset]);
12554
+ WSP_GGML_F32X dst_vec = WSP_GGML_F32X_LOAD(&dst_data[t_h_j_offset]);
12555
+
12556
+ // Compute kv = v * k
12557
+ WSP_GGML_F32X kv_vec = WSP_GGML_F32X_MUL(v_vec, k_vec);
12558
+
12559
+ // Compute temp = prev_state * g + kv
12560
+ WSP_GGML_F32X temp_vec = WSP_GGML_F32X_FMA(kv_vec, prev_state_vec, g_vec);
12561
+
12562
+ // Update dst: dst += temp * q
12563
+ dst_vec = WSP_GGML_F32X_FMA(dst_vec, temp_vec, q_vec);
12564
+ WSP_GGML_F32X_STORE(&dst_data[t_h_j_offset], dst_vec);
12565
+
12566
+ // Update state
12567
+ WSP_GGML_F32X_STORE(&state_cur[h_2d_i_j_offset], temp_vec);
12568
+ }
12569
+
12570
+ // Handle remaining elements, this will not be used.
12571
+ for (int64_t j = vec_count * GLA_VECTOR_SIZE; j < head_size; j++) {
12572
+ size_t t_h_j_offset = t_h_offset + j;
12573
+ size_t h_2d_i_j_offset = h_2d_i_offset + j;
12574
+ float v_val = v[t_h_j_offset];
12575
+ float kv_val = v_val * k_val;
12576
+ float prev_state_val = state_prev[h_2d_i_j_offset];
12577
+ float temp_val = kv_val + prev_state_val * g_val;
12578
+ dst_data[t_h_j_offset] += temp_val * q_val;
12579
+ state_cur[h_2d_i_j_offset] = temp_val;
12580
+ }
12581
+ }
12582
+ }
12583
+ }
12584
+
12585
+ #else
12586
+ for (int64_t t = 0; t < T; t++) {
12587
+ size_t t_offset = t * t_stride;
12588
+ size_t state_offset = head_size * C * (t / (T / n_seqs));
12589
+ float * state_cur = state + state_offset;
12590
+ float * state_prev = t % (T / n_seqs) ? state_cur : (float*)dst->src[4]->data + state_offset;
12591
+
12592
+ for (int64_t h = h_start; h < h_end; h++) {
12593
+ size_t h_offset = h * h_stride;
12594
+ size_t t_h_offset = t_offset + h_offset;
12595
+ size_t h_2d_offset = h * h_stride_2d;
12596
+
12597
+ for (int64_t i = 0; i < head_size; i++) {
12598
+ size_t t_h_i_offset = t_h_offset + i;
12599
+ size_t h_2d_i_offset = h_2d_offset + i * h_stride;
12600
+
12601
+ float k_val = k[t_h_i_offset];
12602
+ float q_val = q[t_h_i_offset] * scale;
12603
+ float g_val = g[t_h_i_offset];
12604
+
12605
+ for (int64_t j = 0; j < head_size; j++) {
12606
+ size_t t_h_j_offset = t_h_offset + j;
12607
+ size_t h_2d_i_j_offset = h_2d_i_offset + j;
12608
+
12609
+ float v_val = v[t_h_j_offset];
12610
+ float kv_val = v_val * k_val;
12611
+ float prev_state_val = state_prev[h_2d_i_j_offset];
12612
+ float temp_val = prev_state_val * g_val + kv_val;
12613
+ dst_data[t_h_j_offset] += temp_val * q_val;
12614
+ state_cur[h_2d_i_j_offset] = temp_val;
12615
+ }
12616
+ }
12617
+ }
12618
+ }
12619
+ #endif
12620
+ }
12621
+
12622
+
12623
+ static void wsp_ggml_compute_forward_gla(
12624
+ const struct wsp_ggml_compute_params * params,
12625
+ struct wsp_ggml_tensor * dst) {
12626
+
12627
+ const struct wsp_ggml_tensor * src0 = dst->src[0];
12628
+
12629
+ switch (src0->type) {
12630
+ case WSP_GGML_TYPE_F32:
12631
+ {
12632
+ wsp_ggml_compute_forward_gla_f32(params, dst);
12633
+ } break;
12634
+ default:
12635
+ {
12636
+ WSP_GGML_ABORT("fatal error");
12637
+ }
12638
+ }
12639
+ }
12640
+
11845
12641
  // wsp_ggml_compute_forward_map_unary
11846
12642
 
11847
12643
  static void wsp_ggml_compute_forward_map_unary_f32(
@@ -12135,22 +12931,22 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32(
12135
12931
  const struct wsp_ggml_compute_params * params,
12136
12932
  struct wsp_ggml_tensor * dst) {
12137
12933
 
12138
- const struct wsp_ggml_tensor * src0 = dst->src[0];
12139
- const struct wsp_ggml_tensor * src1 = dst->src[1];
12140
- const struct wsp_ggml_tensor * opt0 = dst->src[2];
12934
+ const struct wsp_ggml_tensor * grad = dst->src[0]; // gradient of forward pass output
12935
+ const struct wsp_ggml_tensor * src0f = dst->src[1]; // src0 of forward pass
12936
+ const struct wsp_ggml_tensor * src1f = dst->src[2]; // src1 of forward pass
12141
12937
 
12142
12938
  WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst));
12143
- WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
12144
- WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1));
12145
- WSP_GGML_ASSERT(wsp_ggml_is_contiguous(opt0));
12146
- WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst));
12939
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0f));
12940
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src1f));
12941
+ WSP_GGML_ASSERT(wsp_ggml_is_contiguous(grad));
12942
+ WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0f, src1f) && wsp_ggml_are_same_shape(src0f, dst));
12147
12943
 
12148
12944
  const int64_t ith = params->ith;
12149
12945
  const int64_t nth = params->nth;
12150
12946
 
12151
12947
  // TODO: handle transposed/permuted matrices
12152
- const int64_t nc = src0->ne[0];
12153
- const int64_t nr = wsp_ggml_nrows(src0);
12948
+ const int64_t nc = src0f->ne[0];
12949
+ const int64_t nr = wsp_ggml_nrows(src0f);
12154
12950
 
12155
12951
  // rows per thread
12156
12952
  const int64_t dr = (nr + nth - 1)/nth;
@@ -12159,12 +12955,12 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32(
12159
12955
  const int64_t ir0 = dr*ith;
12160
12956
  const int64_t ir1 = MIN(ir0 + dr, nr);
12161
12957
 
12162
- const float d_by_nr = ((const float *) opt0->data)[0] / (float) nr;
12958
+ const float d_by_nr = ((const float *) grad->data)[0] / (float) nr;
12163
12959
 
12164
12960
  for (int64_t i1 = ir0; i1 < ir1; i1++) {
12165
- float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
12166
- float * s0 = (float *)((char *) src0->data + i1*src0->nb[1]);
12167
- float * s1 = (float *)((char *) src1->data + i1*src1->nb[1]);
12961
+ float * ds0 = (float *)((char *) dst->data + i1*dst->nb[1]);
12962
+ const float * s0 = (const float *)((const char *) src0f->data + i1*src0f->nb[1]);
12963
+ const float * s1 = (const float *)((const char *) src1f->data + i1*src1f->nb[1]);
12168
12964
 
12169
12965
  #ifndef NDEBUG
12170
12966
  for (int64_t i = 0; i < nc; ++i) {
@@ -12177,11 +12973,11 @@ static void wsp_ggml_compute_forward_cross_entropy_loss_back_f32(
12177
12973
  // soft_max
12178
12974
  float max = -INFINITY;
12179
12975
  wsp_ggml_vec_max_f32(nc, &max, s0);
12180
- wsp_ggml_float sum = wsp_ggml_vec_soft_max_f32(nc, ds0, s0, max);
12976
+ const wsp_ggml_float sum = wsp_ggml_vec_soft_max_f32(nc, ds0, s0, max);
12181
12977
  assert(sum > 0.0);
12182
12978
  wsp_ggml_vec_scale_f32(nc, ds0, 1.0/sum);
12183
12979
 
12184
- // grad(src0) = (softmax(src0) - src1) * grad(cross_entropy_loss(src0, src1)) / nr
12980
+ // grad(src0f) = (softmax(src0f) - src1f) * grad(cross_entropy_loss(src0f, src1f)) / nr
12185
12981
  wsp_ggml_vec_sub_f32(nc, ds0, ds0, s1);
12186
12982
  wsp_ggml_vec_scale_f32(nc, ds0, d_by_nr);
12187
12983
 
@@ -12304,6 +13100,9 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
12304
13100
  return;
12305
13101
  }
12306
13102
 
13103
+ // extra_buffer op?
13104
+ if (wsp_ggml_cpu_extra_compute_forward(params, tensor)) return;
13105
+
12307
13106
  switch (tensor->op) {
12308
13107
  case WSP_GGML_OP_DUP:
12309
13108
  {
@@ -12475,7 +13274,7 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
12475
13274
  } break;
12476
13275
  case WSP_GGML_OP_SOFT_MAX_BACK:
12477
13276
  {
12478
- wsp_ggml_compute_forward_soft_max_back(params, tensor);
13277
+ wsp_ggml_compute_forward_soft_max_ext_back(params, tensor);
12479
13278
  } break;
12480
13279
  case WSP_GGML_OP_ROPE:
12481
13280
  {
@@ -12525,6 +13324,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
12525
13324
  {
12526
13325
  wsp_ggml_compute_forward_pad(params, tensor);
12527
13326
  } break;
13327
+ case WSP_GGML_OP_PAD_REFLECT_1D:
13328
+ {
13329
+ wsp_ggml_compute_forward_pad_reflect_1d(params, tensor);
13330
+ } break;
12528
13331
  case WSP_GGML_OP_ARANGE:
12529
13332
  {
12530
13333
  wsp_ggml_compute_forward_arange(params, tensor);
@@ -12584,6 +13387,10 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
12584
13387
  {
12585
13388
  wsp_ggml_compute_forward_rwkv_wkv6(params, tensor);
12586
13389
  } break;
13390
+ case WSP_GGML_OP_GATED_LINEAR_ATTN:
13391
+ {
13392
+ wsp_ggml_compute_forward_gla(params, tensor);
13393
+ } break;
12587
13394
  case WSP_GGML_OP_MAP_UNARY:
12588
13395
  {
12589
13396
  wsp_ggml_unary_op_f32_t fun;
@@ -12867,6 +13674,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
12867
13674
  } break;
12868
13675
  case WSP_GGML_OP_UPSCALE:
12869
13676
  case WSP_GGML_OP_PAD:
13677
+ case WSP_GGML_OP_PAD_REFLECT_1D:
12870
13678
  case WSP_GGML_OP_ARANGE:
12871
13679
  case WSP_GGML_OP_TIMESTEP_EMBEDDING:
12872
13680
  case WSP_GGML_OP_ARGSORT:
@@ -12881,6 +13689,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
12881
13689
  case WSP_GGML_OP_WIN_UNPART:
12882
13690
  case WSP_GGML_OP_GET_REL_POS:
12883
13691
  case WSP_GGML_OP_RWKV_WKV6:
13692
+ case WSP_GGML_OP_GATED_LINEAR_ATTN:
12884
13693
  case WSP_GGML_OP_MAP_UNARY:
12885
13694
  case WSP_GGML_OP_MAP_BINARY:
12886
13695
  case WSP_GGML_OP_MAP_CUSTOM1_F32:
@@ -12956,7 +13765,7 @@ static thread_ret_t wsp_ggml_graph_compute_secondary_thread(void* data);
12956
13765
  #include "windows.h"
12957
13766
 
12958
13767
  // TODO: support > 64 CPUs
12959
- bool wsp_ggml_thread_apply_affinity(bool * mask) {
13768
+ static bool wsp_ggml_thread_apply_affinity(bool * mask) {
12960
13769
  HANDLE h = GetCurrentThread();
12961
13770
  uint64_t bitmask = 0ULL;
12962
13771
 
@@ -13246,140 +14055,148 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(
 
  size_t cur = 0;
 
- switch (node->op) {
- case WSP_GGML_OP_CPY:
- case WSP_GGML_OP_DUP:
- {
- if (wsp_ggml_is_quantized(node->type) ||
- // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
- (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
- (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
+ if (!wsp_ggml_cpu_extra_work_size(n_threads, node, &cur)) {
+
+ switch (node->op) {
+ case WSP_GGML_OP_CPY:
+ case WSP_GGML_OP_DUP:
+ {
+ if (wsp_ggml_is_quantized(node->type) ||
+ // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32
+ (node->src[0]->type == WSP_GGML_TYPE_F16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_BF16) ||
+ (node->src[0]->type == WSP_GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == WSP_GGML_TYPE_F16)) {
+ cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
+ }
+ } break;
+ case WSP_GGML_OP_ADD:
+ case WSP_GGML_OP_ADD1:
+ {
+ if (wsp_ggml_is_quantized(node->src[0]->type)) {
+ cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+ }
+ } break;
+ case WSP_GGML_OP_ACC:
+ {
+ if (wsp_ggml_is_quantized(node->src[0]->type)) {
+ cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
+ }
+ } break;
+ case WSP_GGML_OP_COUNT_EQUAL:
+ {
+ cur = wsp_ggml_type_size(node->type)*n_tasks;
+ } break;
+ case WSP_GGML_OP_MUL_MAT:
+ {
+ const enum wsp_ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
+
+ if (node->src[1]->type != vec_dot_type) {
+ cur = wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(node->src[1]));
+ }
+ } break;
+ case WSP_GGML_OP_MUL_MAT_ID:
+ {
+ cur = 0;
+ const struct wsp_ggml_tensor * src0 = node->src[0];
+ const struct wsp_ggml_tensor * src1 = node->src[1];
+ const struct wsp_ggml_tensor * ids = node->src[2];
+ const enum wsp_ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+ const int n_as = src0->ne[2];
+ // src1
+ if (src1->type != vec_dot_type) {
+ cur += wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(src1)) + sizeof(int64_t);
+ }
+ // matrix_row_counts
+ cur += n_as * sizeof(int64_t) + sizeof(int64_t);
+ // matrix_rows
+ cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
+ // atomic_current_chunk
+ cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
+ } break;
+ case WSP_GGML_OP_OUT_PROD:
+ {
+ if (wsp_ggml_is_quantized(node->src[0]->type)) {
+ cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
+ }
+ } break;
+ case WSP_GGML_OP_SOFT_MAX:
+ case WSP_GGML_OP_ROPE:
+ case WSP_GGML_OP_ROPE_BACK:
+ {
  cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
- }
- } break;
- case WSP_GGML_OP_ADD:
- case WSP_GGML_OP_ADD1:
- {
- if (wsp_ggml_is_quantized(node->src[0]->type)) {
- cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
- }
- } break;
- case WSP_GGML_OP_ACC:
- {
- if (wsp_ggml_is_quantized(node->src[0]->type)) {
- cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
- }
- } break;
- case WSP_GGML_OP_COUNT_EQUAL:
- {
- cur = wsp_ggml_type_size(node->type)*n_tasks;
- } break;
- case WSP_GGML_OP_MUL_MAT:
- {
- const enum wsp_ggml_type vec_dot_type = type_traits_cpu[node->src[0]->type].vec_dot_type;
+ } break;
+ case WSP_GGML_OP_CONV_TRANSPOSE_1D:
+ {
+ WSP_GGML_ASSERT(node->src[0]->ne[3] == 1);
+ WSP_GGML_ASSERT(node->src[1]->ne[2] == 1);
+ WSP_GGML_ASSERT(node->src[1]->ne[3] == 1);
+
+ const int64_t ne00 = node->src[0]->ne[0]; // K
+ const int64_t ne01 = node->src[0]->ne[1]; // Cout
+ const int64_t ne02 = node->src[0]->ne[2]; // Cin
+ const int64_t ne10 = node->src[1]->ne[0]; // L
+ const int64_t ne11 = node->src[1]->ne[1]; // Cin
+
+ if ((node->src[0]->type == WSP_GGML_TYPE_F16 ||
+ node->src[0]->type == WSP_GGML_TYPE_BF16) &&
+ node->src[1]->type == WSP_GGML_TYPE_F32) {
+ cur += sizeof(wsp_ggml_fp16_t)*ne00*ne01*ne02;
+ cur += sizeof(wsp_ggml_fp16_t)*ne10*ne11;
+ } else if (node->src[0]->type == WSP_GGML_TYPE_F32 &&
+ node->src[1]->type == WSP_GGML_TYPE_F32) {
+ cur += sizeof(float)*ne00*ne01*ne02;
+ cur += sizeof(float)*ne10*ne11;
+ } else {
+ WSP_GGML_ABORT("fatal error");
+ }
+ } break;
+ case WSP_GGML_OP_CONV_TRANSPOSE_2D:
+ {
+ const int64_t ne00 = node->src[0]->ne[0]; // W
+ const int64_t ne01 = node->src[0]->ne[1]; // H
+ const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
+ const int64_t ne03 = node->src[0]->ne[3]; // Channels In
 
- if (node->src[1]->type != vec_dot_type) {
- cur = wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(node->src[1]));
- }
- } break;
- case WSP_GGML_OP_MUL_MAT_ID:
- {
- cur = 0;
- const struct wsp_ggml_tensor * src0 = node->src[0];
- const struct wsp_ggml_tensor * src1 = node->src[1];
- const enum wsp_ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
- if (src1->type != vec_dot_type) {
- cur += wsp_ggml_row_size(vec_dot_type, wsp_ggml_nelements(src1));
- }
- const int n_as = src0->ne[2];
- cur += WSP_GGML_PAD(cur, sizeof(int64_t)); // align
- cur += n_as * sizeof(int64_t); // matrix_row_counts
- cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
- } break;
- case WSP_GGML_OP_OUT_PROD:
- {
- if (wsp_ggml_is_quantized(node->src[0]->type)) {
- cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
- }
- } break;
- case WSP_GGML_OP_SOFT_MAX:
- case WSP_GGML_OP_ROPE:
- {
- cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
- } break;
- case WSP_GGML_OP_CONV_TRANSPOSE_1D:
- {
- WSP_GGML_ASSERT(node->src[0]->ne[3] == 1);
- WSP_GGML_ASSERT(node->src[1]->ne[2] == 1);
- WSP_GGML_ASSERT(node->src[1]->ne[3] == 1);
-
- const int64_t ne00 = node->src[0]->ne[0]; // K
- const int64_t ne01 = node->src[0]->ne[1]; // Cout
- const int64_t ne02 = node->src[0]->ne[2]; // Cin
-
- const int64_t ne10 = node->src[1]->ne[0]; // L
- const int64_t ne11 = node->src[1]->ne[1]; // Cin
-
- if ((node->src[0]->type == WSP_GGML_TYPE_F16 ||
- node->src[0]->type == WSP_GGML_TYPE_BF16) &&
- node->src[1]->type == WSP_GGML_TYPE_F32) {
- cur += sizeof(wsp_ggml_fp16_t)*ne00*ne01*ne02;
- cur += sizeof(wsp_ggml_fp16_t)*ne10*ne11;
- } else if (node->src[0]->type == WSP_GGML_TYPE_F32 &&
- node->src[1]->type == WSP_GGML_TYPE_F32) {
- cur += sizeof(float)*ne00*ne01*ne02;
- cur += sizeof(float)*ne10*ne11;
- } else {
- WSP_GGML_ABORT("fatal error");
- }
- } break;
- case WSP_GGML_OP_CONV_TRANSPOSE_2D:
- {
- const int64_t ne00 = node->src[0]->ne[0]; // W
- const int64_t ne01 = node->src[0]->ne[1]; // H
- const int64_t ne02 = node->src[0]->ne[2]; // Channels Out
- const int64_t ne03 = node->src[0]->ne[3]; // Channels In
-
- const int64_t ne10 = node->src[1]->ne[0]; // W
- const int64_t ne11 = node->src[1]->ne[1]; // H
- const int64_t ne12 = node->src[1]->ne[2]; // Channels In
-
- cur += sizeof(wsp_ggml_fp16_t)*ne00*ne01*ne02*ne03;
- cur += sizeof(wsp_ggml_fp16_t)*ne10*ne11*ne12;
- } break;
- case WSP_GGML_OP_FLASH_ATTN_EXT:
- {
- const int64_t ne00 = node->src[0]->ne[0]; // D
+ const int64_t ne10 = node->src[1]->ne[0]; // W
+ const int64_t ne11 = node->src[1]->ne[1]; // H
+ const int64_t ne12 = node->src[1]->ne[2]; // Channels In
 
- cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
- } break;
- case WSP_GGML_OP_FLASH_ATTN_BACK:
- {
- const int64_t D = node->src[0]->ne[0];
- const int64_t ne11 = wsp_ggml_up(node->src[1]->ne[1], WSP_GGML_SOFT_MAX_UNROLL);
- const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in wsp_ggml_compute_forward_flash_attn_back
- if (node->src[1]->type == WSP_GGML_TYPE_F32) {
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
- } else if (node->src[1]->type == WSP_GGML_TYPE_F16) {
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
- } else if (node->src[1]->type == WSP_GGML_TYPE_BF16) {
- cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
- cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
- }
- } break;
+ cur += sizeof(wsp_ggml_fp16_t)*ne00*ne01*ne02*ne03;
+ cur += sizeof(wsp_ggml_fp16_t)*ne10*ne11*ne12;
+ } break;
+ case WSP_GGML_OP_FLASH_ATTN_EXT:
+ {
+ const int64_t ne00 = node->src[0]->ne[0]; // D
 
- case WSP_GGML_OP_CROSS_ENTROPY_LOSS:
- {
- cur = wsp_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
- } break;
- case WSP_GGML_OP_COUNT:
- {
- WSP_GGML_ABORT("fatal error");
- }
- default:
- break;
+ cur = 3*sizeof(float)*ne00*n_tasks; // 3x head size/thread
+ } break;
+ case WSP_GGML_OP_FLASH_ATTN_BACK:
+ {
+ const int64_t D = node->src[0]->ne[0];
+ const int64_t ne11 = wsp_ggml_up(node->src[1]->ne[1], WSP_GGML_SOFT_MAX_UNROLL);
+ const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in wsp_ggml_compute_forward_flash_attn_back
+ if (node->src[1]->type == WSP_GGML_TYPE_F32) {
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+ } else if (node->src[1]->type == WSP_GGML_TYPE_F16) {
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+ } else if (node->src[1]->type == WSP_GGML_TYPE_BF16) {
+ cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1)
+ cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2
+ }
+ } break;
+
+ case WSP_GGML_OP_CROSS_ENTROPY_LOSS:
+ {
+ cur = wsp_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
+ } break;
+ case WSP_GGML_OP_COUNT:
+ {
+ WSP_GGML_ABORT("fatal error");
+ }
+ default:
+ break;
+ }
  }
 
  work_size = MAX(work_size, cur);
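For orientation, and not part of the package itself: the per-node scratch size cur computed by the switch above is folded into a single shared work buffer with work_size = MAX(work_size, cur). A minimal, self-contained sketch of that reduction, where the array of per-node sizes is a hypothetical stand-in for the real per-op calculation:

    #include <stddef.h>
    #include <stdio.h>

    /* Sketch only: fold per-node scratch requirements into one buffer size,
       mirroring the work_size = MAX(work_size, cur) pattern above. */
    static size_t plan_work_size(const size_t * per_node_cur, int n_nodes) {
        size_t work_size = 0;
        for (int i = 0; i < n_nodes; i++) {
            if (per_node_cur[i] > work_size) {
                work_size = per_node_cur[i];
            }
        }
        return work_size;
    }

    int main(void) {
        const size_t cur[] = {0, 4096, 1024}; // hypothetical per-node scratch sizes
        printf("work_size = %zu\n", plan_work_size(cur, 3));
        return 0;
    }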
@@ -13414,20 +14231,24 @@ static thread_ret_t wsp_ggml_graph_compute_thread(void * data) {
  /*.threadpool=*/ tp,
  };
 
- for (int node_n = 0; node_n < cgraph->n_nodes && !tp->abort; node_n++) {
+ for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
  struct wsp_ggml_tensor * node = cgraph->nodes[node_n];
 
  wsp_ggml_compute_forward(&params, node);
 
  if (state->ith == 0 && cplan->abort_callback &&
  cplan->abort_callback(cplan->abort_callback_data)) {
- tp->abort = true;
+ atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
  tp->ec = WSP_GGML_STATUS_ABORTED;
  }
 
- wsp_ggml_barrier(state->threadpool);
+ if (node_n + 1 < cgraph->n_nodes) {
+ wsp_ggml_barrier(state->threadpool);
+ }
  }
 
+ wsp_ggml_barrier(state->threadpool);
+
  return 0;
  }
 
@@ -13578,29 +14399,6 @@ static void wsp_ggml_graph_compute_kickoff(struct wsp_ggml_threadpool * threadpo
 
  #endif // WSP_GGML_USE_OPENMP
 
- void wsp_ggml_threadpool_params_init(struct wsp_ggml_threadpool_params * p, int n_threads) {
- p->n_threads = n_threads;
- p->prio = 0; // default priority (usually means normal or inherited)
- p->poll = 50; // hybrid-polling enabled
- p->strict_cpu = false; // no strict placement (all threads share same cpumask)
- p->paused = false; // threads are ready to go
- memset(p->cpumask, 0, WSP_GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited)
- }
-
- struct wsp_ggml_threadpool_params wsp_ggml_threadpool_params_default(int n_threads) {
- struct wsp_ggml_threadpool_params p;
- wsp_ggml_threadpool_params_init(&p, n_threads);
- return p;
- }
-
- bool wsp_ggml_threadpool_params_match(const struct wsp_ggml_threadpool_params * p0, const struct wsp_ggml_threadpool_params * p1) {
- if (p0->n_threads != p1->n_threads ) return false;
- if (p0->prio != p1->prio ) return false;
- if (p0->poll != p1->poll ) return false;
- if (p0->strict_cpu != p1->strict_cpu ) return false;
- return memcmp(p0->cpumask, p1->cpumask, WSP_GGML_MAX_N_THREADS) == 0;
- }
-
  static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(
  struct wsp_ggml_threadpool_params * tpp,
  struct wsp_ggml_cgraph * cgraph,
@@ -13617,7 +14415,7 @@ static struct wsp_ggml_threadpool * wsp_ggml_threadpool_new_impl(
  threadpool->current_chunk = 0;
  threadpool->stop = false;
  threadpool->pause = tpp->paused;
- threadpool->abort = false;
+ threadpool->abort = -1;
  threadpool->workers = NULL;
  threadpool->n_threads_max = tpp->n_threads;
  threadpool->n_threads_cur = tpp->n_threads;
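The two hunks above change threadpool->abort from a bool into an atomic node index: -1 means "keep going", and storing node_n + 1 lets the node in flight finish while stopping the loop before the next one starts. A minimal sketch of that pattern, assuming a standalone C11 program with a simplified single-threaded loop standing in for the worker threads:

    #include <stdatomic.h>
    #include <stdio.h>

    // Sketch only: the abort flag is an atomic node index instead of a bool,
    // read and written with relaxed ordering as in the diff above.
    static atomic_int abort_node = -1; // mirrors threadpool->abort = -1

    int main(void) {
        const int n_nodes = 8;
        for (int node_n = 0;
             node_n < n_nodes &&
             atomic_load_explicit(&abort_node, memory_order_relaxed) != node_n;
             node_n++) {
            printf("compute node %d\n", node_n); // stand-in for wsp_ggml_compute_forward
            if (node_n == 2) { // stand-in for the abort callback returning true
                atomic_store_explicit(&abort_node, node_n + 1, memory_order_relaxed);
            }
        }
        return 0;
    }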
@@ -13696,7 +14494,7 @@ enum wsp_ggml_status wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, str
  threadpool->cgraph = cgraph;
  threadpool->cplan = cplan;
  threadpool->current_chunk = 0;
- threadpool->abort = false;
+ threadpool->abort = -1;
  threadpool->ec = WSP_GGML_STATUS_SUCCESS;
  }
 
@@ -13895,16 +14693,32 @@ int wsp_ggml_cpu_has_vsx(void) {
  #endif
  }
 
+ int wsp_ggml_cpu_has_vxe(void) {
+ #if defined(__VXE__) || defined(__VXE2__)
+ return 1;
+ #else
+ return 0;
+ #endif
+ }
+
  int wsp_ggml_cpu_has_neon(void) {
- #if defined(__ARM_ARCH)
+ #if defined(__ARM_ARCH) && defined(__ARM_NEON)
  return wsp_ggml_arm_arch_features.has_neon;
  #else
  return 0;
  #endif
  }
 
+ int wsp_ggml_cpu_has_dotprod(void) {
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_DOTPROD)
+ return wsp_ggml_arm_arch_features.has_dotprod;
+ #else
+ return 0;
+ #endif
+ }
+
  int wsp_ggml_cpu_has_sve(void) {
- #if defined(__ARM_ARCH)
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
  return wsp_ggml_arm_arch_features.has_sve;
  #else
  return 0;
@@ -13912,7 +14726,7 @@ int wsp_ggml_cpu_has_sve(void) {
  }
 
  int wsp_ggml_cpu_has_matmul_int8(void) {
- #if defined(__ARM_ARCH)
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_MATMUL_INT8)
  return wsp_ggml_arm_arch_features.has_i8mm;
  #else
  return 0;
@@ -13920,13 +14734,21 @@ int wsp_ggml_cpu_has_matmul_int8(void) {
  }
 
  int wsp_ggml_cpu_get_sve_cnt(void) {
- #if defined(__ARM_ARCH)
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SVE)
  return wsp_ggml_arm_arch_features.sve_cnt;
  #else
  return 0;
  #endif
  }
 
+ int wsp_ggml_cpu_has_sme(void) {
+ #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
+ return wsp_ggml_arm_arch_features.has_sme;
+ #else
+ return 0;
+ #endif
+ }
+
  void wsp_ggml_cpu_init(void) {
  // needed to initialize f16 tables
  {
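The feature probes above (wsp_ggml_cpu_has_dotprod, wsp_ggml_cpu_has_sme, and the tightened NEON/SVE/i8mm guards) report support only when the feature was both compiled in and detected at runtime. A minimal sketch of that pattern, where cpu_has_sme and runtime_flag are hypothetical stand-ins for wsp_ggml_cpu_has_sme() and the cached wsp_ggml_arm_arch_features.has_sme flag:

    // Sketch only: combine a compile-time guard with a runtime-detected flag.
    int cpu_has_sme(int runtime_flag) {
    #if defined(__ARM_ARCH) && defined(__ARM_FEATURE_SME)
        return runtime_flag;     // compiled for SME: trust the runtime probe
    #else
        (void) runtime_flag;     // not compiled in: never report support
        return 0;
    #endif
    }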