npm - whisper.rn - Versions diffs - 0.4.0-rc.10 → 0.4.0-rc.12 - Mend

whisper.rn 0.4.0-rc.10 → 0.4.0-rc.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/android/src/main/CMakeLists.txt +9 -3
package/cpp/amx/amx.cpp +220 -0
package/cpp/amx/amx.h +8 -0
package/cpp/amx/common.h +91 -0
package/cpp/amx/mmq.cpp +2511 -0
package/cpp/amx/mmq.h +10 -0
package/cpp/ggml-alloc.c +6 -14
package/cpp/ggml-backend-impl.h +50 -11
package/cpp/ggml-backend-reg.cpp +409 -31
package/cpp/ggml-backend.cpp +9 -3
package/cpp/ggml-backend.h +18 -0
package/cpp/ggml-common.h +41 -43
package/cpp/ggml-cpp.h +1 -0
package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +941 -254
package/cpp/ggml-cpu-aarch64.h +2 -24
package/cpp/ggml-cpu-impl.h +171 -11
package/cpp/ggml-cpu-quants.c +1812 -389
package/cpp/ggml-cpu-traits.cpp +36 -0
package/cpp/ggml-cpu-traits.h +38 -0
package/cpp/ggml-cpu.c +1432 -610
package/cpp/ggml-cpu.cpp +131 -141
package/cpp/ggml-cpu.h +10 -50
package/cpp/ggml-impl.h +27 -11
package/cpp/ggml-metal-impl.h +39 -0
package/cpp/ggml-metal.h +1 -1
package/cpp/ggml-metal.m +1031 -359
package/cpp/ggml-opt.cpp +854 -0
package/cpp/ggml-opt.h +216 -0
package/cpp/ggml-quants.c +0 -9
package/cpp/ggml-threading.h +4 -2
package/cpp/ggml-whisper.metallib +0 -0
package/cpp/ggml.c +501 -1537
package/cpp/ggml.h +144 -171
package/cpp/gguf.cpp +1329 -0
package/cpp/gguf.h +202 -0
package/cpp/whisper.cpp +254 -114
package/cpp/whisper.h +6 -3
package/lib/commonjs/version.json +1 -1
package/lib/module/version.json +1 -1
package/package.json +2 -1
package/src/version.json +1 -1
package/whisper-rn.podspec +2 -2
package/cpp/README.md +0 -4
package/cpp/ggml-aarch64.c +0 -129
package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml-backend.cpp.rej +0 -12

package/cpp/ggml.c CHANGED Viewed

@@ -8,7 +8,10 @@
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
-#include "ggml-aarch64.h"
+#ifdef WSP_GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -125,6 +128,10 @@ static void wsp_ggml_print_backtrace_symbols(void) {
 #endif
 static void wsp_ggml_print_backtrace(void) {
+    const char * WSP_GGML_NO_BACKTRACE = getenv("WSP_GGML_NO_BACKTRACE");
+    if (WSP_GGML_NO_BACKTRACE) {
+        return;
+    }
     char attach[32];
     snprintf(attach, sizeof(attach), "attach %d", getpid());
     int pid = fork();
@@ -233,7 +240,11 @@ void wsp_ggml_log_callback_default(enum wsp_ggml_log_level level, const char * t
 void * wsp_ggml_aligned_malloc(size_t size) {
+#if defined(__s390x__)
+    const int alignment = 256;
+#else
     const int alignment = 64;
+#endif
 #if defined(_MSC_VER) || defined(__MINGW32__)
     return _aligned_malloc(size, alignment);
@@ -788,32 +799,23 @@ static const struct wsp_ggml_type_traits type_traits[WSP_GGML_TYPE_COUNT] = {
         .to_float                 = (wsp_ggml_to_float_t) wsp_ggml_bf16_to_fp32_row,
         .from_float_ref           = (wsp_ggml_from_float_t) wsp_ggml_fp32_to_bf16_row_ref,
     },
-    [WSP_GGML_TYPE_Q4_0_4_4] = {
-        .type_name                = "q4_0_4x4",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 4,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [31] = { // WSP_GGML_TYPE_Q4_0_4_4
+        .type_name                = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [WSP_GGML_TYPE_Q4_0_4_8] = {
-        .type_name                = "q4_0_4x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [32] = { // WSP_GGML_TYPE_Q4_0_4_8
+        .type_name                = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [WSP_GGML_TYPE_Q4_0_8_8] = {
-        .type_name                = "q4_0_8x8",
-        .blck_size                = QK4_0,
-        .blck_size_interleave     = 8,
-        .type_size                = sizeof(block_q4_0),
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [33] = { // WSP_GGML_TYPE_Q4_0_8_8
+        .type_name                = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
     [WSP_GGML_TYPE_TQ1_0] = {
         .type_name                = "tq1_0",
@@ -831,6 +833,24 @@ static const struct wsp_ggml_type_traits type_traits[WSP_GGML_TYPE_COUNT] = {
         .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_tq2_0,
         .from_float_ref           = (wsp_ggml_from_float_t) wsp_quantize_row_tq2_0_ref,
     },
+    [36] = { // WSP_GGML_TYPE_IQ4_NL_4_4
+        .type_name                = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [37] = { // WSP_GGML_TYPE_IQ4_NL_4_8
+        .type_name                = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [38] = { // WSP_GGML_TYPE_IQ4_NL_8_8
+        .type_name                = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
 };
 const struct wsp_ggml_type_traits * wsp_ggml_get_type_traits(enum wsp_ggml_type type) {
@@ -941,6 +961,7 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
+    "PAD_REFLECT_1D",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -955,6 +976,7 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "GET_REL_POS",
     "ADD_REL_POS",
     "RWKV_WKV6",
+    "GATED_LINEAR_ATTN",
     "UNARY",
@@ -974,7 +996,7 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
-static_assert(WSP_GGML_OP_COUNT == 81, "WSP_GGML_OP_COUNT != 81");
+static_assert(WSP_GGML_OP_COUNT == 83, "WSP_GGML_OP_COUNT != 83");
 static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "none",
@@ -1036,6 +1058,7 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
+    "pad_reflect_1d(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1050,6 +1073,7 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "get_rel_pos(x)",
     "add_rel_pos(x)",
     "rwkv_wkv6(k, v, r, tf, td, s)",
+    "gated_linear_attn(k, v, q, gate, s)",
     "unary(x)",
@@ -1069,7 +1093,7 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "adamw(x)",
 };
-static_assert(WSP_GGML_OP_COUNT == 81, "WSP_GGML_OP_COUNT != 81");
+static_assert(WSP_GGML_OP_COUNT == 83, "WSP_GGML_OP_COUNT != 83");
 static_assert(WSP_GGML_OP_POOL_COUNT == 2, "WSP_GGML_OP_POOL_COUNT != 2");
@@ -1259,9 +1283,6 @@ enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype) {
         case WSP_GGML_FTYPE_MOSTLY_IQ4_XS:        wtype = WSP_GGML_TYPE_IQ4_XS;   break;
         case WSP_GGML_FTYPE_MOSTLY_IQ3_S:         wtype = WSP_GGML_TYPE_IQ3_S;    break;
         case WSP_GGML_FTYPE_MOSTLY_IQ2_S:         wtype = WSP_GGML_TYPE_IQ2_S;    break;
-        case WSP_GGML_FTYPE_MOSTLY_Q4_0_4_4:      wtype = WSP_GGML_TYPE_Q4_0_4_4; break;
-        case WSP_GGML_FTYPE_MOSTLY_Q4_0_4_8:      wtype = WSP_GGML_TYPE_Q4_0_4_8; break;
-        case WSP_GGML_FTYPE_MOSTLY_Q4_0_8_8:      wtype = WSP_GGML_TYPE_Q4_0_8_8; break;
         case WSP_GGML_FTYPE_UNKNOWN:              wtype = WSP_GGML_TYPE_COUNT; break;
         case WSP_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = WSP_GGML_TYPE_COUNT; break;
     }
@@ -1362,7 +1383,7 @@ bool wsp_ggml_are_same_stride(const struct wsp_ggml_tensor * t0, const struct ws
         (t0->nb[3] == t1->nb[3]);
 }
-// check if t1 can be represented as a repeatition of t0
+// check if t1 can be represented as a repetition of t0
 bool wsp_ggml_can_repeat(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1) {
     static_assert(WSP_GGML_MAX_DIMS == 4, "WSP_GGML_MAX_DIMS is not 4 - update this function");
@@ -1577,15 +1598,8 @@ static struct wsp_ggml_tensor * wsp_ggml_new_tensor_impl(
     struct wsp_ggml_tensor * const result = (struct wsp_ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
-#ifdef __clang__
-    // temporary until wsp_ggml_tensor::backend is removed
-    #pragma clang diagnostic push
-    #pragma clang diagnostic ignored "-Wdeprecated-declarations"
-#endif
     *result = (struct wsp_ggml_tensor) {
         /*.type         =*/ type,
-        /*.backend      =*/ WSP_GGML_BACKEND_TYPE_CPU,
         /*.buffer       =*/ NULL,
         /*.ne           =*/ { 1, 1, 1, 1 },
         /*.nb           =*/ { 0, 0, 0, 0 },
@@ -1601,10 +1615,6 @@ static struct wsp_ggml_tensor * wsp_ggml_new_tensor_impl(
         /*.padding      =*/ { 0 },
     };
-#ifdef __clang__
-    #pragma clang diagnostic pop
-#endif
     // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
     //WSP_GGML_ASSERT_ALIGNED(result->data);
@@ -2255,6 +2265,7 @@ struct wsp_ggml_tensor * wsp_ggml_argmax(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a) {
     WSP_GGML_ASSERT(wsp_ggml_is_matrix(a));
+    WSP_GGML_ASSERT(a->ne[0] <= INT32_MAX);
     struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, a->ne[1]);
@@ -3447,12 +3458,14 @@ struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
     return wsp_ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
 }
-// wsp_ggml_soft_max_back
+// wsp_ggml_soft_max_ext_back
-static struct wsp_ggml_tensor * wsp_ggml_soft_max_back_impl(
+static struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back_impl(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
         struct wsp_ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias,
         bool                  inplace) {
     struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a);
@@ -3460,21 +3473,28 @@ static struct wsp_ggml_tensor * wsp_ggml_soft_max_back_impl(
     result->src[0] = a;
     result->src[1] = b;
+    memcpy((float *) result->op_params + 0, &scale,    sizeof(float));
+    memcpy((float *) result->op_params + 1, &max_bias, sizeof(float));
     return result;
 }
-struct wsp_ggml_tensor * wsp_ggml_soft_max_back(
+struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b) {
-    return wsp_ggml_soft_max_back_impl(ctx, a, b, false);
+        struct wsp_ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return wsp_ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, false);
 }
-struct wsp_ggml_tensor * wsp_ggml_soft_max_back_inplace(
+struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back_inplace(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b) {
-    return wsp_ggml_soft_max_back_impl(ctx, a, b, true);
+        struct wsp_ggml_tensor  * b,
+        float                 scale,
+        float                 max_bias) {
+    return wsp_ggml_soft_max_ext_back_impl(ctx, a, b, scale, max_bias, true);
 }
 // wsp_ggml_rope
@@ -3505,15 +3525,18 @@ static struct wsp_ggml_tensor * wsp_ggml_rope_impl(
         WSP_GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
+    int sections[4] = {0, 0, 0, 0};
     struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a);
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     wsp_ggml_set_op_params(result, params, sizeof(params));
     result->op     = WSP_GGML_OP_ROPE;
@@ -3535,6 +3558,53 @@ struct wsp_ggml_tensor * wsp_ggml_rope(
     );
 }
+struct wsp_ggml_tensor * wsp_ggml_rope_multi(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        struct wsp_ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    // Multimodal Rotary Position Embedding
+    WSP_GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+    WSP_GGML_ASSERT(wsp_ggml_is_vector(b));
+    WSP_GGML_ASSERT(b->type == WSP_GGML_TYPE_I32);
+    WSP_GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+    if (c) {
+        WSP_GGML_ASSERT(c->type == WSP_GGML_TYPE_F32);
+        WSP_GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+    struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a);
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections,      sizeof(int)*4);
+    wsp_ggml_set_op_params(result, params, sizeof(params));
+    result->op   = WSP_GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+    return result;
+}
 struct wsp_ggml_tensor * wsp_ggml_rope_inplace(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
@@ -3642,7 +3712,7 @@ void wsp_ggml_rope_yarn_corr_dims(
 // wsp_ggml_rope_back
-struct wsp_ggml_tensor * wsp_ggml_rope_back(
+struct wsp_ggml_tensor * wsp_ggml_rope_ext_back(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
         struct wsp_ggml_tensor  * b,
@@ -3656,29 +3726,32 @@ struct wsp_ggml_tensor * wsp_ggml_rope_back(
         float                 attn_factor,
         float                 beta_fast,
         float                 beta_slow) {
-    WSP_GGML_ASSERT(wsp_ggml_is_vector(b));
-    WSP_GGML_ASSERT(b->type == WSP_GGML_TYPE_I32);
-    WSP_GGML_ASSERT(a->ne[2] == b->ne[0]);
-    struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a);
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
-    memcpy(params +  5, &freq_base,    sizeof(float));
-    memcpy(params +  6, &freq_scale,   sizeof(float));
-    memcpy(params +  7, &ext_factor,   sizeof(float));
-    memcpy(params +  8, &attn_factor,  sizeof(float));
-    memcpy(params +  9, &beta_fast,    sizeof(float));
-    memcpy(params + 10, &beta_slow,    sizeof(float));
-    wsp_ggml_set_op_params(result, params, sizeof(params));
-    result->op     = WSP_GGML_OP_ROPE_BACK;
-    result->src[0] = a;
-    result->src[1] = b;
-    result->src[2] = c;
+    struct wsp_ggml_tensor * result = wsp_ggml_rope_ext(
+        ctx, a, b, c, n_dims, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = WSP_GGML_OP_ROPE_BACK;
     return result;
 }
+struct wsp_ggml_tensor * wsp_ggml_rope_multi_back(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        struct wsp_ggml_tensor  * c,
+        int                   n_dims,
+        int                   sections[4],
+        int                   mode,
+        int                   n_ctx_orig,
+        float                 freq_base,
+        float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow) {
+    struct wsp_ggml_tensor * result = wsp_ggml_rope_multi(
+        ctx, a, b, c, n_dims, sections, mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+    result->op = WSP_GGML_OP_ROPE_BACK;
+    return result;
+}
 // wsp_ggml_clamp
 struct wsp_ggml_tensor * wsp_ggml_clamp(
@@ -3698,104 +3771,10 @@ struct wsp_ggml_tensor * wsp_ggml_clamp(
     return result;
 }
-// wsp_ggml_conv_1d
 static int64_t wsp_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
-WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
-        struct wsp_ggml_context * ctx,
-        struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, WSP_GGML_TYPE_F16); // [N, OL, IC * K]
-    struct wsp_ggml_tensor * result =
-        wsp_ggml_mul_mat(ctx,
-                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
-                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC，IC, K] => [OC, IC * K]
-    result = wsp_ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
-    return result;
-}
-// wsp_ggml_conv_1d_ph
-struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph(
-        struct wsp_ggml_context * ctx,
-        struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b,
-        int                   s,
-        int                   d) {
-    return wsp_ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
-}
-// wsp_ggml_conv_transpose_1d
-static int64_t wsp_ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
-    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
-}
-WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
-        struct wsp_ggml_context * ctx,
-        struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b,
-        int                   s0,
-        int                   p0,
-        int                   d0) {
-    WSP_GGML_ASSERT(wsp_ggml_is_matrix(b));
-    WSP_GGML_ASSERT(a->ne[2] == b->ne[1]);
-    WSP_GGML_ASSERT(a->ne[3] == 1);
-    WSP_GGML_ASSERT(p0 == 0);
-    WSP_GGML_ASSERT(d0 == 1);
-    const int64_t ne[4] = {
-        wsp_ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
-        a->ne[1], b->ne[2], 1,
-    };
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne);
-    int32_t params[] = { s0, p0, d0 };
-    wsp_ggml_set_op_params(result, params, sizeof(params));
-    result->op     = WSP_GGML_OP_CONV_TRANSPOSE_1D;
-    result->src[0] = a;
-    result->src[1] = b;
-    return result;
-}
-// wsp_ggml_conv_depthwise
-struct wsp_ggml_tensor * wsp_ggml_conv_depthwise_2d(
-        struct wsp_ggml_context * ctx,
-        struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b,
-        int                   s0,
-        int                   s1,
-        int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1) {
-    struct wsp_ggml_tensor * new_a = wsp_ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
-    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, new_a,
-                                        wsp_ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true, WSP_GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
-    struct wsp_ggml_tensor * new_b = wsp_ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
-    new_a = wsp_ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
-    struct wsp_ggml_tensor * result = wsp_ggml_mul_mat(ctx, new_a, new_b);
-    result = wsp_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
-    return result;
-}
-// wsp_ggml_conv_2d
 // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
 // a: [OC，IC, KH, KW]
 // b: [N, IC, IH, IW]
@@ -3812,10 +3791,11 @@ struct wsp_ggml_tensor * wsp_ggml_im2col(
         int                   d1,
         bool                  is_2D,
         enum wsp_ggml_type        dst_type) {
-    if(is_2D) {
+    if (is_2D) {
         WSP_GGML_ASSERT(a->ne[2] == b->ne[2]);
     } else {
-        WSP_GGML_ASSERT(a->ne[1] == b->ne[1]);
+        //WSP_GGML_ASSERT(b->ne[1] % a->ne[1] == 0);
+        WSP_GGML_ASSERT(b->ne[1] == a->ne[1]);
         WSP_GGML_ASSERT(b->ne[3] == 1);
     }
@@ -3866,51 +3846,178 @@ struct wsp_ggml_tensor * wsp_ggml_im2col_back(
     return result;
 }
-// a: [OC，IC, KH, KW]
-// b: [N, IC, IH, IW]
-// result: [N, OC, OH, OW]
-struct wsp_ggml_tensor * wsp_ggml_conv_2d(
+// wsp_ggml_conv_1d
+struct wsp_ggml_tensor * wsp_ggml_conv_1d(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
         struct wsp_ggml_tensor  * b,
         int                   s0,
-        int                   s1,
         int                   p0,
-        int                   p1,
-        int                   d0,
-        int                   d1) {
-    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
+        int                   d0) {
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, WSP_GGML_TYPE_F16); // [N, OL, IC * K]
     struct wsp_ggml_tensor * result =
         wsp_ggml_mul_mat(ctx,
-                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
-                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
-    result = wsp_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
-    result = wsp_ggml_cont(ctx, wsp_ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC，IC, K] => [OC, IC * K]
+    result = wsp_ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
     return result;
 }
-// wsp_ggml_conv_2d_sk_p0
+// wsp_ggml_conv_1d_ph
-struct wsp_ggml_tensor * wsp_ggml_conv_2d_sk_p0(
+struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b) {
-    return wsp_ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+        struct wsp_ggml_tensor  * b,
+        int                   s,
+        int                   d) {
+    return wsp_ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d);
 }
-// wsp_ggml_conv_2d_s1_ph
+// wsp_ggml_conv_1d_dw
-struct wsp_ggml_tensor * wsp_ggml_conv_2d_s1_ph(
+struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
-        struct wsp_ggml_tensor  * b) {
-    return wsp_ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
-}
+        struct wsp_ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
+    struct wsp_ggml_tensor * new_a = wsp_ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]);
+    struct wsp_ggml_tensor * new_b = wsp_ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, WSP_GGML_TYPE_F16);
+    struct wsp_ggml_tensor * result = wsp_ggml_mul_mat(ctx, im2col, a);
+    result = wsp_ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1);
+    return result;
+}
+// wsp_ggml_conv_1d_dw_ph
+struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw_ph(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        int                   s0,
+        int                   d0) {
+    return wsp_ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
+}
+// wsp_ggml_conv_transpose_1d
+static int64_t wsp_ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
+    return (ins - 1) * s - 2 * p + d * (ks - 1) + 1;
+}
+WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0) {
+    WSP_GGML_ASSERT(wsp_ggml_is_matrix(b));
+    WSP_GGML_ASSERT(a->ne[2] == b->ne[1]);
+    WSP_GGML_ASSERT(a->ne[3] == 1);
+    WSP_GGML_ASSERT(p0 == 0);
+    WSP_GGML_ASSERT(d0 == 1);
+    const int64_t ne[4] = {
+        wsp_ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/),
+        a->ne[1], b->ne[2], 1,
+    };
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne);
+    int32_t params[] = { s0, p0, d0 };
+    wsp_ggml_set_op_params(result, params, sizeof(params));
+    result->op     = WSP_GGML_OP_CONV_TRANSPOSE_1D;
+    result->src[0] = a;
+    result->src[1] = b;
+    return result;
+}
+// wsp_ggml_conv_2d
+// a: [OC，IC, KH, KW]
+// b: [N, IC, IH, IW]
+// result: [N, OC, OH, OW]
+struct wsp_ggml_tensor * wsp_ggml_conv_2d(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true, a->type); // [N, OH, OW, IC * KH * KW]
+    struct wsp_ggml_tensor * result =
+        wsp_ggml_mul_mat(ctx,
+                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
+    result = wsp_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], im2col->ne[3], a->ne[3]); // [OC, N, OH, OW]
+    result = wsp_ggml_cont(ctx, wsp_ggml_permute(ctx, result, 0, 1, 3, 2)); // [N, OC, OH, OW]
+    return result;
+}
+// wsp_ggml_conv_2d_sk_p0
+struct wsp_ggml_tensor * wsp_ggml_conv_2d_sk_p0(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b) {
+    return wsp_ggml_conv_2d(ctx, a, b, a->ne[0], a->ne[1], 0, 0, 1, 1);
+}
+// wsp_ggml_conv_2d_s1_ph
+struct wsp_ggml_tensor * wsp_ggml_conv_2d_s1_ph(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b) {
+    return wsp_ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1);
+}
+// wsp_ggml_conv_2d_dw
+struct wsp_ggml_tensor * wsp_ggml_conv_2d_dw(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        int                   s0,
+        int                   s1,
+        int                   p0,
+        int                   p1,
+        int                   d0,
+        int                   d1) {
+    struct wsp_ggml_tensor * new_a = wsp_ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, new_a,
+                                        wsp_ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
+                                        s0, s1, p0, p1, d0, d1, true, WSP_GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+    struct wsp_ggml_tensor * new_b = wsp_ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
+    new_a = wsp_ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
+    struct wsp_ggml_tensor * result = wsp_ggml_mul_mat(ctx, new_a, new_b);
+    result = wsp_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
+    return result;
+}
 // wsp_ggml_conv_transpose_2d_p0
 static int64_t wsp_ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) {
@@ -4087,6 +4194,37 @@ struct wsp_ggml_tensor * wsp_ggml_pad(
     return result;
 }
+// wsp_ggml_pad_reflect_1d
+struct wsp_ggml_tensor * wsp_ggml_pad_reflect_1d(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        int                   p0,
+        int                   p1) {
+    WSP_GGML_ASSERT(p0 >= 0);
+    WSP_GGML_ASSERT(p1 >= 0);
+    WSP_GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
+    WSP_GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a));
+    WSP_GGML_ASSERT(a->type == WSP_GGML_TYPE_F32);
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+    int32_t params[] = { p0, p1 };
+    wsp_ggml_set_op_params(result, params, sizeof(params));
+    result->op     = WSP_GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+    return result;
+}
 // wsp_ggml_arange
 struct wsp_ggml_tensor * wsp_ggml_arange(
@@ -4138,6 +4276,7 @@ struct wsp_ggml_tensor * wsp_ggml_argsort(
         struct wsp_ggml_context  * ctx,
         struct wsp_ggml_tensor   * a,
         enum wsp_ggml_sort_order   order) {
+    WSP_GGML_ASSERT(a->ne[0] <= INT32_MAX);
     struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_I32, WSP_GGML_MAX_DIMS, a->ne);
     wsp_ggml_set_op_params_i32(result, 0, (int32_t) order);
@@ -4512,15 +4651,13 @@ struct wsp_ggml_tensor * wsp_ggml_rwkv_wkv6(
     WSP_GGML_ASSERT(wsp_ggml_is_contiguous(state));
     const int64_t S = k->ne[0];
-    const int64_t H = k->ne[2];
-    const int64_t n_tokens = k->ne[3];
+    const int64_t H = k->ne[1];
+    const int64_t n_tokens = k->ne[2];
     const int64_t n_seqs = state->ne[1];
     {
-        WSP_GGML_ASSERT(k->ne[1] == 1);
-        WSP_GGML_ASSERT(v->ne[0] == 1 && v->ne[1] == S && v->ne[2] == H && v->ne[3] == n_tokens);
-        WSP_GGML_ASSERT(r->ne[0] == 1 && r->ne[1] == S && r->ne[2] == H && r->ne[3] == n_tokens);
-        // TODO: RWKV v4 and v5
-        WSP_GGML_ASSERT(td->ne[0] == 1 && td->ne[1] == S && td->ne[2] == H && td->ne[3] == n_tokens);
+        WSP_GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens);
+        WSP_GGML_ASSERT(r->ne[0] == S && r->ne[1] == H && r->ne[2] == n_tokens);
+        WSP_GGML_ASSERT(td->ne[0] == S && td->ne[1] == H && td->ne[2] == n_tokens);
         WSP_GGML_ASSERT(wsp_ggml_nelements(state) == S * S * H * n_seqs);
     }
@@ -4539,6 +4676,49 @@ struct wsp_ggml_tensor * wsp_ggml_rwkv_wkv6(
     return result;
 }
+// wsp_ggml_gated_linear_attn: creates a WSP_GGML_OP_GATED_LINEAR_ATTN node with sources k, v, q, g, state and `scale` stored as float op param 0
+struct wsp_ggml_tensor * wsp_ggml_gated_linear_attn(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * k,
+        struct wsp_ggml_tensor  * v,
+        struct wsp_ggml_tensor  * q,
+        struct wsp_ggml_tensor  * g,
+        struct wsp_ggml_tensor  * state,
+        float scale) {
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(k));
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(v));
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(q));
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(g));
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(state));
+    const int64_t S = k->ne[0]; // presumably the per-head size - TODO confirm
+    const int64_t H = k->ne[1]; // presumably the number of heads - TODO confirm
+    const int64_t n_tokens = k->ne[2];
+    const int64_t n_seqs = state->ne[1];
+    {
+        WSP_GGML_ASSERT(v->ne[0] == S && v->ne[1] == H && v->ne[2] == n_tokens); // v, q and g must match k in dims 0..2
+        WSP_GGML_ASSERT(q->ne[0] == S && q->ne[1] == H && q->ne[2] == n_tokens);
+        WSP_GGML_ASSERT(g->ne[0] == S && g->ne[1] == H && g->ne[2] == n_tokens);
+        WSP_GGML_ASSERT(wsp_ggml_nelements(state) == S * S * H * n_seqs);
+    }
+    // concat output and new_state
+    const int64_t ne[4] = { S * H, n_tokens + S * n_seqs, 1, 1 }; // the extra S * n_seqs entries of width S * H hold the S * S * H * n_seqs state elements asserted above
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne);
+    wsp_ggml_set_op_params_f32(result, 0, scale);
+    result->op     = WSP_GGML_OP_GATED_LINEAR_ATTN;
+    result->src[0] = k;
+    result->src[1] = v;
+    result->src[2] = q;
+    result->src[3] = g;
+    result->src[4] = state;
+    return result;
+}
 // wsp_ggml_unary
 static struct wsp_ggml_tensor * wsp_ggml_unary_impl(
@@ -4913,10 +5093,10 @@ struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss_back(
         struct wsp_ggml_tensor  * a,
         struct wsp_ggml_tensor  * b,
         struct wsp_ggml_tensor  * c) {
-    WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b));
-    WSP_GGML_ASSERT(wsp_ggml_is_scalar(c));
+    WSP_GGML_ASSERT(wsp_ggml_is_scalar(a));
+    WSP_GGML_ASSERT(wsp_ggml_are_same_shape(b, c));
-    struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a);
+    struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, b);
     result->op     = WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK;
     result->src[0] = a;
@@ -5019,8 +5199,10 @@ static void wsp_ggml_hash_map_free(struct hash_map * map) {
 }
 // utility functions to change gradients
-// if a is in acc_table, modify gradients in-place and mark result as gradient accumulator
-// else if a is in zero_table, replace a
+// isrc is the index of tensor in cgraph->visited_hash_set.keys
+// the corresponding gradients (and gradient accumulators) are also at position isrc
+// if tensor has a gradient accumulator, modify that accumulator in-place
+// else if there is no gradient for tensor, set the corresponding value
 // else, just add/subtract/etc. the gradients
 static void wsp_ggml_add_or_set(
@@ -5028,11 +5210,14 @@ static void wsp_ggml_add_or_set(
         struct wsp_ggml_cgraph  * cgraph,
         size_t                isrc,
         struct wsp_ggml_tensor  * tensor) {
+    struct wsp_ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    WSP_GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
-        cgraph->grads[isrc] = wsp_ggml_add_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
+        cgraph->grads[isrc] = wsp_ggml_add_impl(ctx, cgraph->grads[isrc], tensor, /*inplace =*/ cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = tensor;
     }
+    wsp_ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     wsp_ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5040,18 +5225,20 @@ static void wsp_ggml_acc_or_set(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct wsp_ggml_tensor  * src,
         struct wsp_ggml_tensor  * tensor,
         const  size_t         nb1,
         const  size_t         nb2,
         const  size_t         nb3,
         const  size_t         offset) {
+    struct wsp_ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    WSP_GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = wsp_ggml_acc_impl(ctx, cgraph->grads[isrc], tensor, nb1, nb2, nb3, offset, cgraph->grad_accs[isrc]);
     } else {
         struct wsp_ggml_tensor * a_zero = wsp_ggml_scale(ctx, src, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
         cgraph->grads[isrc] = wsp_ggml_acc_impl(ctx, a_zero, tensor, nb1, nb2, nb3, offset, false);
     }
+    wsp_ggml_format_name(cgraph->grads[isrc], "grad for %s", cgraph->visited_hash_set.keys[isrc]->name);
     wsp_ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5059,13 +5246,15 @@ static void wsp_ggml_add1_or_set(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_cgraph  * cgraph,
         size_t                isrc,
-        struct wsp_ggml_tensor  * src,
         struct wsp_ggml_tensor  * tensor) {
+    struct wsp_ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    WSP_GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = wsp_ggml_add1_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = wsp_ggml_repeat(ctx, tensor, src);
     }
+    wsp_ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     wsp_ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
@@ -5074,16 +5263,19 @@ static void wsp_ggml_sub_or_set(
         struct wsp_ggml_cgraph  * cgraph,
         size_t                isrc,
         struct wsp_ggml_tensor  * tensor) {
+    struct wsp_ggml_tensor * src = cgraph->visited_hash_set.keys[isrc];
+    WSP_GGML_ASSERT(src);
     if (cgraph->grads[isrc]) {
         cgraph->grads[isrc] = wsp_ggml_sub_impl(ctx, cgraph->grads[isrc], tensor, cgraph->grad_accs[isrc]);
     } else {
         cgraph->grads[isrc] = wsp_ggml_neg(ctx, tensor);
     }
+    wsp_ggml_format_name(cgraph->grads[isrc], "grad for %s", src->name);
     wsp_ggml_build_forward_expand(cgraph, cgraph->grads[isrc]);
 }
 static void wsp_ggml_compute_backward(
-        struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph, int i, bool * grads_needed) {
+        struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph, int i, const bool * grads_needed) {
     struct wsp_ggml_tensor * tensor = cgraph->nodes[i];
     struct wsp_ggml_tensor * grad   = wsp_ggml_graph_get_grad(cgraph, tensor);
@@ -5095,12 +5287,12 @@ static void wsp_ggml_compute_backward(
     struct wsp_ggml_tensor * src1 = tensor->src[1];
     struct wsp_ggml_tensor * src2 = tensor->src[2];
     struct wsp_ggml_hash_set * hash_set = &cgraph->visited_hash_set;
-    const size_t isrc0 = wsp_ggml_hash_find(hash_set, src0);
-    const size_t isrc1 = wsp_ggml_hash_find(hash_set, src1);
-    const size_t isrc2 = wsp_ggml_hash_find(hash_set, src2);
-    const bool src0_needs_grads = isrc0 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
-    const bool src1_needs_grads = isrc1 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
-    const bool src2_needs_grads = isrc2 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
+    const size_t isrc0 = src0 ? wsp_ggml_hash_find(hash_set, src0) : (size_t) -1;
+    const size_t isrc1 = src1 ? wsp_ggml_hash_find(hash_set, src1) : (size_t) -1;
+    const size_t isrc2 = src2 ? wsp_ggml_hash_find(hash_set, src2) : (size_t) -1;
+    const bool src0_needs_grads = src0 && isrc0 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc0) && grads_needed[isrc0];
+    const bool src1_needs_grads = src1 && isrc1 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc1) && grads_needed[isrc1];
+    const bool src2_needs_grads = src2 && isrc2 != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(hash_set->used, isrc2) && grads_needed[isrc2];
     switch (tensor->op) {
         case WSP_GGML_OP_DUP: {
@@ -5155,7 +5347,7 @@ static void wsp_ggml_compute_backward(
         } break;
         case WSP_GGML_OP_MUL: {
             if (src0_needs_grads) {
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_mul(ctx, src1, grad));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_mul(ctx, grad, src1));
             }
             if (src1_needs_grads) {
                 struct wsp_ggml_tensor * tmp = wsp_ggml_mul(ctx, src0, grad);
@@ -5200,7 +5392,7 @@ static void wsp_ggml_compute_backward(
         } break;
         case WSP_GGML_OP_SUM: {
             if (src0_needs_grads) {
-                wsp_ggml_add1_or_set(ctx, cgraph, isrc0, src0, grad);
+                wsp_ggml_add1_or_set(ctx, cgraph, isrc0, grad);
             }
         } break;
         case WSP_GGML_OP_SUM_ROWS: {
@@ -5210,7 +5402,7 @@ static void wsp_ggml_compute_backward(
         } break;
         case WSP_GGML_OP_MEAN: {
             if (src0_needs_grads) {
-                wsp_ggml_add1_or_set(ctx, cgraph, isrc0, src0, wsp_ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
+                wsp_ggml_add1_or_set(ctx, cgraph, isrc0, wsp_ggml_scale_impl(ctx, grad, 1.0f/src0->ne[0], false));
             }
         } break;
         case WSP_GGML_OP_REPEAT: {
@@ -5227,7 +5419,7 @@ static void wsp_ggml_compute_backward(
             if (src0_needs_grads) {
                 float eps;
                 memcpy(&eps, tensor->op_params, sizeof(float));
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_rms_norm_back(ctx, src0, grad, eps));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_rms_norm_back(ctx, grad, src0, eps));
             }
         } break;
         case WSP_GGML_OP_MUL_MAT: {
@@ -5247,21 +5439,25 @@ static void wsp_ggml_compute_backward(
             // src1.shape   [n,p,qq,rr]
             if (src0_needs_grads) {
-                struct wsp_ggml_tensor * s1_tg =
+                WSP_GGML_ASSERT(grad->ne[2] == src1->ne[2]);
+                WSP_GGML_ASSERT(grad->ne[3] == src1->ne[3]);
+                struct wsp_ggml_tensor * tmp =
                     wsp_ggml_out_prod(ctx, // [n,m,qq,rr]
                         src1,          // [n,p,qq,rr]
                         grad);         // [m,p,qq,rr]
-                const int64_t qq = s1_tg->ne[2];
-                const int64_t rr = s1_tg->ne[3];
-                const int64_t q1 = src0->ne[2];
-                const int64_t r1 = src0->ne[3];
-                const bool ne2_broadcasted = qq > q1;
-                const bool ne3_broadcasted = rr > r1;
-                if (ne2_broadcasted || ne3_broadcasted) {
-                    // sum broadcast repetitions of s1_tg into shape of src0
-                    s1_tg = wsp_ggml_repeat_back(ctx, s1_tg, src0);
+                if (!wsp_ggml_are_same_shape(tmp, src0)) {
+                    WSP_GGML_ASSERT(tmp->ne[0] == src0->ne[0]);
+                    WSP_GGML_ASSERT(tmp->ne[1] == src0->ne[1]);
+                    WSP_GGML_ASSERT(tmp->ne[3] == 1);
+                    const int64_t nr2 = tmp->ne[2] / src0->ne[2];
+                    const size_t nb2 = tmp->nb[2] * nr2;
+                    const size_t nb3 = tmp->nb[2];
+                    tmp = wsp_ggml_view_4d(ctx, tmp, src0->ne[0], src0->ne[1], src0->ne[2], nr2, tmp->nb[1], nb2, nb3, 0);
+                    tmp = wsp_ggml_repeat_back(ctx, tmp, src0);
                 }
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, s1_tg /*= [n,m,q1,r1]*/);
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, tmp);
             }
             if (src1_needs_grads) {
                 wsp_ggml_add_or_set(ctx, cgraph, isrc1,
@@ -5330,7 +5526,9 @@ static void wsp_ggml_compute_backward(
             if (src0_needs_grads) {
                 WSP_GGML_ASSERT(!cgraph->grads[isrc0] || wsp_ggml_is_contiguous(cgraph->grads[isrc0]));
                 WSP_GGML_ASSERT(wsp_ggml_is_contiguous(grad));
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, grad);
+                WSP_GGML_ASSERT(wsp_ggml_nelements(tensor) == wsp_ggml_nelements(src0));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0,
+                    wsp_ggml_are_same_shape(tensor, src0) ? grad : wsp_ggml_reshape(ctx, grad, src0));
             }
         } break;
         case WSP_GGML_OP_RESHAPE: {
@@ -5363,7 +5561,7 @@ static void wsp_ggml_compute_backward(
                     nb3 = (nb3 / n0) * ng;
                 }
-                wsp_ggml_acc_or_set(ctx, cgraph, isrc0, src0, grad, nb1, nb2, nb3, offset);
+                wsp_ggml_acc_or_set(ctx, cgraph, isrc0, grad, nb1, nb2, nb3, offset);
             }
         } break;
         case WSP_GGML_OP_PERMUTE: {
@@ -5410,7 +5608,13 @@ static void wsp_ggml_compute_backward(
         } break;
         case WSP_GGML_OP_SOFT_MAX: {
             if (src0_needs_grads) {
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_soft_max_back(ctx, grad, tensor));
+                float scale    = 1.0f;
+                float max_bias = 0.0f;
+                memcpy(&scale,    (const float *) tensor->op_params + 0, sizeof(float));
+                memcpy(&max_bias, (const float *) tensor->op_params + 1, sizeof(float));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_soft_max_ext_back(ctx, grad, tensor, scale, max_bias));
             }
             WSP_GGML_ASSERT((!src1 || !src1_needs_grads) && "backward pass for softmax mask not implemented");
         } break;
@@ -5422,6 +5626,7 @@ static void wsp_ggml_compute_backward(
                 //const int n_ctx      = ((int32_t *) tensor->op_params)[3];
                 const int n_ctx_orig = ((const int32_t *) tensor->op_params)[4];
                 float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+                int sections[4] = {0, 0, 0, 0};
                 memcpy(&freq_base,   (const float *) tensor->op_params +  5, sizeof(float));
                 memcpy(&freq_scale,  (const float *) tensor->op_params +  6, sizeof(float));
@@ -5429,10 +5634,14 @@ static void wsp_ggml_compute_backward(
                 memcpy(&attn_factor, (const float *) tensor->op_params +  8, sizeof(float));
                 memcpy(&beta_fast,   (const float *) tensor->op_params +  9, sizeof(float));
                 memcpy(&beta_slow,   (const float *) tensor->op_params + 10, sizeof(float));
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0,
-                    wsp_ggml_rope_back(ctx, grad, src1, src2, n_dims, mode, n_ctx_orig, freq_base,
-                        freq_scale, ext_factor, attn_factor, beta_fast, beta_slow));
+                memcpy(&sections,                    tensor->op_params + 11, sizeof(sections));
+                struct wsp_ggml_tensor * rope_back = grad->ne[2] == src1->ne[0] ?
+                    wsp_ggml_rope_ext_back(ctx, grad, src1, src2, n_dims,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow) :
+                    wsp_ggml_rope_multi_back(ctx, grad, src1, src2, n_dims, sections,
+                        mode, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, rope_back);
             }
             WSP_GGML_ASSERT((!src2 || !src2_needs_grads) && "gradients for freq factors not implemented");
         } break;
@@ -5446,7 +5655,7 @@ static void wsp_ggml_compute_backward(
                 const int32_t d1    = wsp_ggml_get_op_params_i32(tensor, 5);
                 const bool    is_2D = wsp_ggml_get_op_params_i32(tensor, 6) == 1;
-                wsp_ggml_add_or_set(ctx, cgraph, isrc1, wsp_ggml_im2col_back(ctx, src0, grad, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc1, wsp_ggml_im2col_back(ctx, grad, src0, src1->ne, s0, s1, p0, p1, d0, d1, is_2D));
             }
         } break;
         case WSP_GGML_OP_POOL_2D: {
@@ -5489,7 +5698,7 @@ static void wsp_ggml_compute_backward(
                 } break;
                 case WSP_GGML_UNARY_OP_SILU: {
                     if (src0_needs_grads) {
-                        wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_silu_back(ctx, src0, grad));
+                        wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_silu_back(ctx, grad, src0));
                     }
                 } break;
                 case WSP_GGML_UNARY_OP_EXP: {
@@ -5506,7 +5715,7 @@ static void wsp_ggml_compute_backward(
         } break;
         case WSP_GGML_OP_CROSS_ENTROPY_LOSS: {
             if (src0_needs_grads) {
-                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_cross_entropy_loss_back(ctx, src0, src1, grad));
+                wsp_ggml_add_or_set(ctx, cgraph, isrc0, wsp_ggml_cross_entropy_loss_back(ctx, grad, src0, src1));
             }
             WSP_GGML_ASSERT(!src1_needs_grads && "backward pass for labels not implemented");
         } break;
@@ -5597,10 +5806,9 @@ void wsp_ggml_build_backward_expand(
     const int n_nodes_f = cgraph->n_nodes;
-    const size_t hash_size = wsp_ggml_hash_size(2*cgraph->size);
-    memset(cgraph->grads,     0, hash_size*sizeof(struct wsp_ggml_tensor *));
-    memset(cgraph->grad_accs, 0, hash_size*sizeof(struct wsp_ggml_tensor *));
-    bool * grads_needed = calloc(hash_size, sizeof(bool));
+    memset(cgraph->grads,     0, cgraph->visited_hash_set.size*sizeof(struct wsp_ggml_tensor *));
+    memset(cgraph->grad_accs, 0, cgraph->visited_hash_set.size*sizeof(struct wsp_ggml_tensor *));
+    bool * grads_needed = calloc(cgraph->visited_hash_set.size, sizeof(bool));
     {
         bool any_params = false;
@@ -5621,7 +5829,7 @@ void wsp_ggml_build_backward_expand(
             continue;
         }
-        bool node_needs_grad = node->flags & WSP_GGML_TENSOR_FLAG_PARAM;
+        bool node_needs_grad = (node->flags & WSP_GGML_TENSOR_FLAG_PARAM) || (node->flags & WSP_GGML_TENSOR_FLAG_LOSS);
         bool ignore_src[WSP_GGML_MAX_SRC] = {false};
         switch (node->op) {
             // gradients in node->src[0] for one reason or another have no effect on output gradients
@@ -5638,7 +5846,7 @@ void wsp_ggml_build_backward_expand(
             } break;
             // gradients in node->src[1] for one reason or another have no effect on output gradients
-            case WSP_GGML_OP_CPY:           // gradients in CPY target  are irrelevant
+            case WSP_GGML_OP_CPY:           // gradients in CPY target are irrelevant
             case WSP_GGML_OP_GET_ROWS:      // row indices not differentiable
             case WSP_GGML_OP_GET_ROWS_BACK: // same as for GET_ROWS
             case WSP_GGML_OP_ROPE:          // positions not differentiable
@@ -5665,9 +5873,12 @@ void wsp_ggml_build_backward_expand(
             node->op == WSP_GGML_OP_RESHAPE || node->op == WSP_GGML_OP_PERMUTE || node->op == WSP_GGML_OP_TRANSPOSE);
         const size_t igrad = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
+        WSP_GGML_ASSERT(igrad != WSP_GGML_HASHSET_FULL);
+        WSP_GGML_ASSERT(wsp_ggml_bitset_get(cgraph->visited_hash_set.used, igrad));
         if ((accumulate && (node->flags & WSP_GGML_TENSOR_FLAG_PARAM)) || (node->flags & WSP_GGML_TENSOR_FLAG_LOSS)) {
-            cgraph->grads[igrad]     = wsp_ggml_dup_tensor(ctx_static, node);
-            cgraph->grad_accs[igrad] = cgraph->grads[igrad];
+            cgraph->grad_accs[igrad] = wsp_ggml_dup_tensor(ctx_static, node);
+            cgraph->grads[igrad]     = cgraph->grad_accs[igrad];
+            wsp_ggml_format_name(cgraph->grad_accs[igrad], "grad acc for %s", node->name);
         }
         grads_needed[igrad] = true;
     }
@@ -5761,15 +5972,15 @@ struct wsp_ggml_cgraph * wsp_ggml_new_graph(struct wsp_ggml_context * ctx) {
 struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph0, int i0, int i1) {
     struct wsp_ggml_cgraph cgraph = {
-        /*.size         =*/ 0,
-        /*.n_nodes      =*/ i1 - i0,
-        /*.n_leafs      =*/ 0,
-        /*.nodes        =*/ cgraph0->nodes + i0,
-        /*.grads        =*/ cgraph0->grads ? cgraph0->grads + i0 : NULL,
-        /*.grad_accs    =*/ cgraph0->grad_accs ? cgraph0->grad_accs + i0 : NULL,
-        /*.leafs        =*/ NULL,
-        /*.hash_table   =*/ { 0, NULL, NULL },
-        /*.order        =*/ cgraph0->order,
+        /*.size             =*/ 0,
+        /*.n_nodes          =*/ i1 - i0,
+        /*.n_leafs          =*/ 0,
+        /*.nodes            =*/ cgraph0->nodes + i0,
+        /*.grads            =*/ NULL, // gradients would need visited_hash_set
+        /*.grad_accs        =*/ NULL,
+        /*.leafs            =*/ NULL,
+        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.order            =*/ cgraph0->order,
     };
     return cgraph;
@@ -5799,12 +6010,22 @@ void wsp_ggml_graph_cpy(struct wsp_ggml_cgraph * src, struct wsp_ggml_cgraph * d
         }
     }
+    if (dst->grads) {
+        memset(dst->grads,     0, dst->visited_hash_set.size*sizeof(struct wsp_ggml_tensor *));
+        memset(dst->grad_accs, 0, dst->visited_hash_set.size*sizeof(struct wsp_ggml_tensor *));
+    }
     if (src->grads) {
         WSP_GGML_ASSERT(dst->grads     != NULL);
         WSP_GGML_ASSERT(dst->grad_accs != NULL);
         for (int i = 0; i < src->n_nodes; ++i) {
             const size_t igrad_src = wsp_ggml_hash_find(&src->visited_hash_set, src->nodes[i]);
             const size_t igrad_dst = wsp_ggml_hash_find(&dst->visited_hash_set, dst->nodes[i]);
+            WSP_GGML_ASSERT(igrad_src != WSP_GGML_HASHSET_FULL);
+            WSP_GGML_ASSERT(wsp_ggml_bitset_get(src->visited_hash_set.used, igrad_src));
+            WSP_GGML_ASSERT(igrad_dst != WSP_GGML_HASHSET_FULL);
+            WSP_GGML_ASSERT(wsp_ggml_bitset_get(dst->visited_hash_set.used, igrad_dst));
             dst->grads[igrad_dst]     = src->grads[igrad_src];
             dst->grad_accs[igrad_dst] = src->grad_accs[igrad_src];
         }
@@ -5839,12 +6060,8 @@ void wsp_ggml_graph_reset(struct wsp_ggml_cgraph * cgraph) {
         if (node->op == WSP_GGML_OP_OPT_STEP_ADAMW) {
             // clear momenta
-            if (node->src[2]->data) {
-                wsp_ggml_set_zero(node->src[2]);
-            }
-            if (node->src[3]->data) {
-                wsp_ggml_set_zero(node->src[3]);
-            }
+            wsp_ggml_set_zero(node->src[2]);
+            wsp_ggml_set_zero(node->src[3]);
         }
         // initial gradients of loss should be 1, 0 otherwise
@@ -5923,12 +6140,12 @@ struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(const struct wsp_ggml_cgraph
 struct wsp_ggml_tensor * wsp_ggml_graph_get_grad(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node) {
     const size_t igrad = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL;
+    return igrad != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL; // NULL if node has no used slot in visited_hash_set or the graph has no grads array
 }
 struct wsp_ggml_tensor * wsp_ggml_graph_get_grad_acc(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node) {
     const size_t igrad = wsp_ggml_hash_find(&cgraph->visited_hash_set, node);
-    return igrad != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL;
+    return igrad != WSP_GGML_HASHSET_FULL && wsp_ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL; // NULL if node has no used slot in visited_hash_set or the graph has no grad_accs array
 }
 void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph) {
@@ -6240,9 +6457,6 @@ size_t wsp_ggml_wsp_quantize_chunk(
         case WSP_GGML_TYPE_IQ1_M:   result = wsp_quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case WSP_GGML_TYPE_IQ4_NL:  result = wsp_quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case WSP_GGML_TYPE_IQ4_XS:  result = wsp_quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case WSP_GGML_TYPE_Q4_0_4_4: result = wsp_quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case WSP_GGML_TYPE_Q4_0_4_8: result = wsp_quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case WSP_GGML_TYPE_Q4_0_8_8: result = wsp_quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case WSP_GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(wsp_ggml_fp16_t);
@@ -6272,1280 +6486,30 @@ size_t wsp_ggml_wsp_quantize_chunk(
 ////////////////////////////////////////////////////////////////////////////////
-struct wsp_gguf_str {
-    uint64_t n;  // GGUFv2
-    char * data;
-};
-static const size_t WSP_GGUF_TYPE_SIZE[WSP_GGUF_TYPE_COUNT] = {
-    [WSP_GGUF_TYPE_UINT8]   = sizeof(uint8_t),
-    [WSP_GGUF_TYPE_INT8]    = sizeof(int8_t),
-    [WSP_GGUF_TYPE_UINT16]  = sizeof(uint16_t),
-    [WSP_GGUF_TYPE_INT16]   = sizeof(int16_t),
-    [WSP_GGUF_TYPE_UINT32]  = sizeof(uint32_t),
-    [WSP_GGUF_TYPE_INT32]   = sizeof(int32_t),
-    [WSP_GGUF_TYPE_FLOAT32] = sizeof(float),
-    [WSP_GGUF_TYPE_BOOL]    = sizeof(bool),
-    [WSP_GGUF_TYPE_STRING]  = sizeof(struct wsp_gguf_str),
-    [WSP_GGUF_TYPE_UINT64]  = sizeof(uint64_t),
-    [WSP_GGUF_TYPE_INT64]   = sizeof(int64_t),
-    [WSP_GGUF_TYPE_FLOAT64] = sizeof(double),
-    [WSP_GGUF_TYPE_ARRAY]   = 0, // undefined
-};
-static_assert(WSP_GGUF_TYPE_COUNT == 13, "WSP_GGUF_TYPE_COUNT != 13");
-static const char * WSP_GGUF_TYPE_NAME[WSP_GGUF_TYPE_COUNT] = {
-    [WSP_GGUF_TYPE_UINT8]   = "u8",
-    [WSP_GGUF_TYPE_INT8]    = "i8",
-    [WSP_GGUF_TYPE_UINT16]  = "u16",
-    [WSP_GGUF_TYPE_INT16]   = "i16",
-    [WSP_GGUF_TYPE_UINT32]  = "u32",
-    [WSP_GGUF_TYPE_INT32]   = "i32",
-    [WSP_GGUF_TYPE_FLOAT32] = "f32",
-    [WSP_GGUF_TYPE_BOOL]    = "bool",
-    [WSP_GGUF_TYPE_STRING]  = "str",
-    [WSP_GGUF_TYPE_ARRAY]   = "arr",
-    [WSP_GGUF_TYPE_UINT64]  = "u64",
-    [WSP_GGUF_TYPE_INT64]   = "i64",
-    [WSP_GGUF_TYPE_FLOAT64] = "f64",
-};
-static_assert(WSP_GGUF_TYPE_COUNT == 13, "WSP_GGUF_TYPE_COUNT != 13");
-union wsp_gguf_value {
-    uint8_t  uint8;
-    int8_t   int8;
-    uint16_t uint16;
-    int16_t  int16;
-    uint32_t uint32;
-    int32_t  int32;
-    float    float32;
-    uint64_t uint64;
-    int64_t  int64;
-    double   float64;
-    bool     bool_;
-    struct wsp_gguf_str str;
-    struct {
-        enum wsp_gguf_type type;
-        uint64_t n;  // GGUFv2
-        void * data;
-    } arr;
-};
-struct wsp_gguf_kv {
-    struct wsp_gguf_str key;
-    enum  wsp_gguf_type  type;
-    union wsp_gguf_value value;
-};
-struct wsp_gguf_header {
-    char magic[4];
-    uint32_t version;
-    uint64_t n_tensors; // GGUFv2
-    uint64_t n_kv;      // GGUFv2
-};
-struct wsp_gguf_tensor_info {
-    struct wsp_gguf_str name;
-    uint32_t n_dims;
-    uint64_t ne[WSP_GGML_MAX_DIMS];
-    enum wsp_ggml_type type;
-    uint64_t offset; // offset from start of `data`, must be a multiple of `ALIGNMENT`
-    // for writing API
-    const void * data;
-    size_t size;
-};
-struct wsp_gguf_context {
-    struct wsp_gguf_header header;
-    struct wsp_gguf_kv          * kv;
-    struct wsp_gguf_tensor_info * infos;
-    size_t alignment;
-    size_t offset;    // offset of `data` from beginning of file
-    size_t size;      // size of `data` in bytes
-    //uint8_t * padding;
-    void * data;
-};
-static size_t wsp_gguf_type_size(enum wsp_gguf_type type) {
-    WSP_GGML_ASSERT(0 <= type && type < WSP_GGUF_TYPE_COUNT);
-    return WSP_GGUF_TYPE_SIZE[type];
-}
-static bool wsp_gguf_tensor_info_sanitize(struct wsp_gguf_tensor_info * info) {
-    if (info->n_dims > WSP_GGML_MAX_DIMS) {
-        fprintf(stderr, "%s: invalid number of dimensions (%" PRIu32 ")\n", __func__, info->n_dims);
-        return false;
-    }
-    if (info->type < 0 || info->type >= WSP_GGML_TYPE_COUNT) {
-        fprintf(stderr, "%s: invalid type (%d)\n", __func__, info->type);
-        return false;
-    }
-    if (strlen(info->name.data) >= WSP_GGML_MAX_NAME) {
-        fprintf(stderr, "%s: tensor '%s' name is too long\n", __func__, info->name.data);
-        return false;
-    }
-    for (uint32_t i = 0; i < info->n_dims; ++i) {
-        if (info->ne[i] <= 0) {
-            fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[i]);
-            return false;
-        }
-    }
-    // prevent overflow for total number of elements
-    if (INT64_MAX/info->ne[1] <= info->ne[0]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[1]);
-        return false;
-    }
-    if (INT64_MAX/info->ne[2] <= info->ne[0]*info->ne[1]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[2]);
-        return false;
-    }
-    if (INT64_MAX/info->ne[3] <= info->ne[0]*info->ne[1]*info->ne[2]) {
-        fprintf(stderr, "%s: invalid number of elements (%" PRIu64 ")\n", __func__, info->ne[3]);
-        return false;
-    }
-    return true;
+void wsp_ggml_log_set(wsp_ggml_log_callback log_callback, void * user_data) { // installs the callback and user data kept in g_logger_state
+    g_logger_state.log_callback = log_callback ? log_callback : wsp_ggml_log_callback_default; // a NULL callback selects wsp_ggml_log_callback_default
+    g_logger_state.log_callback_user_data = user_data;
 }
-static bool wsp_gguf_fread_el(FILE * file, void * dst, size_t size, size_t * offset) {
-    const size_t n = fread(dst, 1, size, file);
-    *offset += n;
-    return n == size;
+void wsp_ggml_threadpool_params_init(struct wsp_ggml_threadpool_params * p, int n_threads) { // writes default settings into *p for the requested thread count
+    p->n_threads  = n_threads;
+    p->prio       = 0;     // default priority (usually means normal or inherited)
+    p->poll       = 50;    // hybrid-polling enabled
+    p->strict_cpu = false; // no strict placement (all threads share same cpumask)
+    p->paused     = false; // threads are ready to go
+    memset(p->cpumask, 0, WSP_GGML_MAX_N_THREADS); // all-zero means use the default affinity (usually inherited); NOTE(review): the byte count is WSP_GGML_MAX_N_THREADS, which assumes 1-byte cpumask elements - confirm against the struct
 }
-static bool wsp_gguf_fread_str(FILE * file, struct wsp_gguf_str * p, size_t * offset) {
-    p->n    = 0;
-    p->data = NULL;
-    bool ok = true;
-    ok = ok && wsp_gguf_fread_el(file, &p->n, sizeof(p->n), offset);
-    // early exit if string length is invalid, prevents from integer overflow
-    if (p->n == SIZE_MAX) {
-        fprintf(stderr, "%s: invalid string length (%" PRIu64 ")\n", __func__, p->n);
-        return false;
-    }
-    p->data = calloc(p->n + 1, 1);
-    if (!p->data) {
-        fprintf(stderr, "%s: failed to allocate memory for string of length %" PRIu64 "\n", __func__, p->n);
-        return false;
-    }
-    ok = ok && wsp_gguf_fread_el(file,  p->data, p->n, offset);
-    return ok;
+struct wsp_ggml_threadpool_params wsp_ggml_threadpool_params_default(int n_threads) { // builds a params struct for n_threads via wsp_ggml_threadpool_params_init and returns it by value
+    struct wsp_ggml_threadpool_params p; // not initialized at declaration; NOTE(review): any member the init call does not assign stays indeterminate
+    wsp_ggml_threadpool_params_init(&p, n_threads); // fills in the defaults for the requested thread count
+    return p;
 }
-static void wsp_gguf_free_kv(struct wsp_gguf_kv * kv) {
-    if (kv->key.data) {
-        WSP_GGML_FREE(kv->key.data);
-    }
-    if (kv->type == WSP_GGUF_TYPE_STRING) {
-        if (kv->value.str.data) {
-            WSP_GGML_FREE(kv->value.str.data);
-        }
-    }
-    if (kv->type == WSP_GGUF_TYPE_ARRAY) {
-        if (kv->value.arr.data) {
-            if (kv->value.arr.type == WSP_GGUF_TYPE_STRING) {
-                for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                    struct wsp_gguf_str * str = &((struct wsp_gguf_str *) kv->value.arr.data)[j];
-                    if (str->data) {
-                        WSP_GGML_FREE(str->data);
-                    }
-                }
-            }
-            WSP_GGML_FREE(kv->value.arr.data);
-        }
-    }
-}
-struct wsp_gguf_context * wsp_gguf_init_empty(void) {
-    struct wsp_gguf_context * ctx = calloc(1, sizeof(struct wsp_gguf_context));
-    if (!ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
-        return NULL;
-    }
-    memcpy(ctx->header.magic, WSP_GGUF_MAGIC, sizeof(ctx->header.magic));
-    ctx->header.version   = WSP_GGUF_VERSION;
-    ctx->header.n_tensors = 0;
-    ctx->header.n_kv      = 0;
-    ctx->kv    = NULL;
-    ctx->infos = NULL;
-    ctx->alignment = WSP_GGUF_DEFAULT_ALIGNMENT;
-    ctx->offset    = 0;
-    ctx->size      = 0;
-    ctx->data = NULL;
-    return ctx;
-}
-struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp_gguf_init_params params) {
-    FILE * file = wsp_ggml_fopen(fname, "rb");
-    if (!file) {
-        fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno));
-        return NULL;
-    }
-    // offset from start of file
-    size_t offset = 0;
-    char magic[4];
-    // check the magic before making allocations
-    {
-        wsp_gguf_fread_el(file, &magic, sizeof(magic), &offset);
-        for (uint32_t i = 0; i < sizeof(magic); i++) {
-            if (magic[i] != WSP_GGUF_MAGIC[i]) {
-                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
-                fclose(file);
-                return NULL;
-            }
-        }
-    }
-    bool ok = true;
-    struct wsp_gguf_context * ctx = calloc(1, sizeof(struct wsp_gguf_context));
-    if (!ctx) {
-        fprintf(stderr, "%s: failed to allocate memory for context\n", __func__);
-        fclose(file);
-        return NULL;
-    }
-    // read the header
-    {
-        strncpy(ctx->header.magic, magic, 4);
-        ctx->kv    = NULL;
-        ctx->infos = NULL;
-        ctx->data  = NULL;
-        ok = ok && wsp_gguf_fread_el(file, &ctx->header.version,   sizeof(ctx->header.version),   &offset);
-        ok = ok && wsp_gguf_fread_el(file, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors), &offset);
-        ok = ok && wsp_gguf_fread_el(file, &ctx->header.n_kv,      sizeof(ctx->header.n_kv),      &offset);
-        if (ctx->header.version == 1) {
-            fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-        // sanity-checks to prevent from integer/buffer overflows
-        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/sizeof(struct wsp_gguf_tensor_info));
-        ok = ok && (ctx->header.n_tensors < (SIZE_MAX/2)/wsp_ggml_tensor_overhead());
-        ok = ok && (ctx->header.n_kv      < (SIZE_MAX/2)/sizeof(struct wsp_gguf_kv));
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read header\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-    }
-    // read the kv pairs
-    {
-        const uint64_t n_kv = ctx->header.n_kv;
-        ctx->kv = calloc(n_kv, sizeof(struct wsp_gguf_kv));
-        if (!ctx->kv) {
-            fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-        for (uint64_t i = 0; i < n_kv; ++i) {
-            struct wsp_gguf_kv * kv = &ctx->kv[i];
-            //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
-            ok = ok && wsp_gguf_fread_str(file, &kv->key,                    &offset);
-            ok = ok && wsp_gguf_fread_el (file, &kv->type, sizeof(kv->type), &offset);
-            //fprintf(stderr, "%s: reading kv with key %s\n", __func__, kv->key.data);
-            switch (kv->type) {
-                case WSP_GGUF_TYPE_UINT8:   ok = ok && wsp_gguf_fread_el (file, &kv->value.uint8,   sizeof(kv->value.uint8),   &offset); break;
-                case WSP_GGUF_TYPE_INT8:    ok = ok && wsp_gguf_fread_el (file, &kv->value.int8,    sizeof(kv->value.int8),    &offset); break;
-                case WSP_GGUF_TYPE_UINT16:  ok = ok && wsp_gguf_fread_el (file, &kv->value.uint16,  sizeof(kv->value.uint16),  &offset); break;
-                case WSP_GGUF_TYPE_INT16:   ok = ok && wsp_gguf_fread_el (file, &kv->value.int16,   sizeof(kv->value.int16),   &offset); break;
-                case WSP_GGUF_TYPE_UINT32:  ok = ok && wsp_gguf_fread_el (file, &kv->value.uint32,  sizeof(kv->value.uint32),  &offset); break;
-                case WSP_GGUF_TYPE_INT32:   ok = ok && wsp_gguf_fread_el (file, &kv->value.int32,   sizeof(kv->value.int32),   &offset); break;
-                case WSP_GGUF_TYPE_FLOAT32: ok = ok && wsp_gguf_fread_el (file, &kv->value.float32, sizeof(kv->value.float32), &offset); break;
-                case WSP_GGUF_TYPE_UINT64:  ok = ok && wsp_gguf_fread_el (file, &kv->value.uint64,  sizeof(kv->value.uint64),  &offset); break;
-                case WSP_GGUF_TYPE_INT64:   ok = ok && wsp_gguf_fread_el (file, &kv->value.int64,   sizeof(kv->value.int64),   &offset); break;
-                case WSP_GGUF_TYPE_FLOAT64: ok = ok && wsp_gguf_fread_el (file, &kv->value.float64, sizeof(kv->value.float64), &offset); break;
-                case WSP_GGUF_TYPE_BOOL:    ok = ok && wsp_gguf_fread_el (file, &kv->value.bool_,   sizeof(kv->value.bool_),   &offset); break;
-                case WSP_GGUF_TYPE_STRING:  ok = ok && wsp_gguf_fread_str(file, &kv->value.str,                                &offset); break;
-                case WSP_GGUF_TYPE_ARRAY:
-                    {
-                        ok = ok && wsp_gguf_fread_el(file, &kv->value.arr.type, sizeof(kv->value.arr.type), &offset);
-                        ok = ok && wsp_gguf_fread_el(file, &kv->value.arr.n,    sizeof(kv->value.arr.n),    &offset);
-                        switch (kv->value.arr.type) {
-                            case WSP_GGUF_TYPE_UINT8:
-                            case WSP_GGUF_TYPE_INT8:
-                            case WSP_GGUF_TYPE_UINT16:
-                            case WSP_GGUF_TYPE_INT16:
-                            case WSP_GGUF_TYPE_UINT32:
-                            case WSP_GGUF_TYPE_INT32:
-                            case WSP_GGUF_TYPE_FLOAT32:
-                            case WSP_GGUF_TYPE_UINT64:
-                            case WSP_GGUF_TYPE_INT64:
-                            case WSP_GGUF_TYPE_FLOAT64:
-                            case WSP_GGUF_TYPE_BOOL:
-                                {
-                                    // prevent from integer overflow in the malloc below
-                                    if (kv->value.arr.n >= SIZE_MAX/wsp_gguf_type_size(kv->value.arr.type)) {
-                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
-                                        wsp_gguf_free(ctx);
-                                        return NULL;
-                                    }
-                                    kv->value.arr.data = calloc(kv->value.arr.n, wsp_gguf_type_size(kv->value.arr.type));
-                                    if (!kv->value.arr.data) {
-                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
-                                        wsp_gguf_free(ctx);
-                                        return NULL;
-                                    }
-                                    ok = ok && wsp_gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * wsp_gguf_type_size(kv->value.arr.type), &offset);
-                                } break;
-                            case WSP_GGUF_TYPE_STRING:
-                                {
-                                    // prevent from integer overflow in the malloc below
-                                    if (kv->value.arr.n >= SIZE_MAX/sizeof(struct wsp_gguf_str)) {
-                                        fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n);
-                                        fclose(file);
-                                        wsp_gguf_free(ctx);
-                                        return NULL;
-                                    }
-                                    kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct wsp_gguf_str));
-                                    if (!kv->value.arr.data) {
-                                        fprintf(stderr, "%s: failed to allocate memory for array\n", __func__);
-                                        fclose(file);
-                                        wsp_gguf_free(ctx);
-                                        return NULL;
-                                    }
-                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
-                                        ok = ok && wsp_gguf_fread_str(file, &((struct wsp_gguf_str *) kv->value.arr.data)[j], &offset);
-                                    }
-                                } break;
-                            case WSP_GGUF_TYPE_ARRAY:
-                            default:
-                                {
-                                    fprintf(stderr, "%s: invalid array type %d\n", __func__, kv->value.arr.type);
-                                    ok = false;
-                                } break;
-                        }
-                    } break;
-                default:
-                    {
-                        fprintf(stderr, "%s: invalid type %d\n", __func__, kv->type);
-                        ok = false;
-                    } break;
-            }
-            if (!ok) {
-                break;
-            }
-        }
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read key-value pairs\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-    }
-    // read the tensor infos
-    if (ctx->header.n_tensors > 0) {
-        ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct wsp_gguf_tensor_info));
-        if (!ctx->infos) {
-            fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct wsp_gguf_tensor_info * info = &ctx->infos[i];
-            for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) {
-                info->ne[j] = 1;
-            }
-            ok = ok && wsp_gguf_fread_str(file, &info->name,                          &offset);
-            ok = ok && wsp_gguf_fread_el (file, &info->n_dims, sizeof(info->n_dims),  &offset);
-            ok = ok && (info->n_dims <= WSP_GGML_MAX_DIMS);
-            for (uint32_t j = 0; j < info->n_dims; ++j) {
-                ok = ok && wsp_gguf_fread_el(file, &info->ne[j], sizeof(info->ne[j]), &offset);
-            }
-            ok = ok && wsp_gguf_fread_el (file, &info->type,   sizeof(info->type),    &offset);
-            ok = ok && wsp_gguf_fread_el (file, &info->offset, sizeof(info->offset),  &offset);
-            ok = ok && wsp_gguf_tensor_info_sanitize(info);
-            // make sure there is no duplicated tensor names
-            for (uint64_t j = 0; j < i && ok; ++j) {
-                if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) {
-                    fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data);
-                    ok = false;
-                }
-            }
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor info\n", __func__);
-                fclose(file);
-                wsp_gguf_free(ctx);
-                return NULL;
-            }
-        }
-    }
-    ctx->alignment = WSP_GGUF_DEFAULT_ALIGNMENT;
-    int alignment_idx = wsp_gguf_find_key(ctx, "general.alignment");
-    if (alignment_idx != -1) {
-        ctx->alignment = wsp_gguf_get_val_u32(ctx, alignment_idx);
-    }
-    // we require the data section to be aligned, so take into account any padding
-    {
-        const size_t offset_pad = offset % ctx->alignment;
-        if (offset_pad != 0) {
-            offset += ctx->alignment - offset_pad;
-            fseek(file, offset, SEEK_SET);
-        }
-    }
-    // store the current file offset - this is where the data section starts
-    ctx->offset = offset;
-    // compute the total size of the data section, taking into account the alignment
-    {
-        ctx->size = 0;
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct wsp_gguf_tensor_info * info = &ctx->infos[i];
-            const int64_t ne =
-                (int64_t) info->ne[0] *
-                (int64_t) info->ne[1] *
-                (int64_t) info->ne[2] *
-                (int64_t) info->ne[3];
-            if (wsp_ggml_blck_size(info->type) == 0 || ne % wsp_ggml_blck_size(info->type) != 0) {
-                fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
-                        __func__, info->name.data, (int) info->type, wsp_ggml_type_name(info->type), ne, wsp_ggml_blck_size(info->type));
-                fclose(file);
-                wsp_gguf_free(ctx);
-                return NULL;
-            }
-            const size_t size_cur = wsp_ggml_row_size(info->type, ne);
-            ctx->size += WSP_GGML_PAD(size_cur, ctx->alignment);
-        }
-    }
-    // load the tensor data only if requested
-    if (params.ctx != NULL) {
-        // if the provided wsp_gguf_context is no_alloc, then we create "empty" tensors and do not read the binary blob
-        // otherwise, we load the binary blob into the created wsp_ggml_context as well, and point the "data" members of
-        // the wsp_ggml_tensor structs to the appropriate locations in the binary blob
-        // compute the exact size needed for the new wsp_ggml_context
-        const size_t mem_size =
-            params.no_alloc ?
-            (ctx->header.n_tensors    )*wsp_ggml_tensor_overhead() :
-            (ctx->header.n_tensors + 1)*wsp_ggml_tensor_overhead() + ctx->size;
-        struct wsp_ggml_init_params pdata = {
-            .mem_size   = mem_size,
-            .mem_buffer = NULL,
-            .no_alloc   = params.no_alloc,
-        };
-        *params.ctx = wsp_ggml_init(pdata);
-        if (*params.ctx == NULL) {
-            fprintf(stderr, "%s: failed to initialize context\n", __func__);
-            fclose(file);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-        struct wsp_ggml_context * ctx_data = *params.ctx;
-        struct wsp_ggml_tensor * data = NULL;
-        if (!params.no_alloc) {
-            data = wsp_ggml_new_tensor_1d(ctx_data, WSP_GGML_TYPE_I8, ctx->size);
-            ok = ok && data != NULL;
-            // read the binary blob with the tensor data
-            ok = ok && wsp_gguf_fread_el(file, data->data, ctx->size, &offset);
-            if (!ok) {
-                fprintf(stderr, "%s: failed to read tensor data\n", __func__);
-                fclose(file);
-                wsp_ggml_free(ctx_data);
-                wsp_gguf_free(ctx);
-                return NULL;
-            }
-            ctx->data = data->data;
-        }
-        wsp_ggml_set_no_alloc(ctx_data, true);
-        // create the tensors
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            const int64_t ne[WSP_GGML_MAX_DIMS] = {
-                ctx->infos[i].ne[0],
-                ctx->infos[i].ne[1],
-                ctx->infos[i].ne[2],
-                ctx->infos[i].ne[3],
-            };
-            struct wsp_ggml_tensor * cur = wsp_ggml_new_tensor(ctx_data, ctx->infos[i].type, ctx->infos[i].n_dims, ne);
-            ok = ok && cur != NULL;
-            if (!ok) {
-                break;
-            }
-            wsp_ggml_set_name(cur, ctx->infos[i].name.data);
-            // point the data member to the appropriate location in the binary blob using the tensor infos
-            if (!params.no_alloc) {
-              //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
-                cur->data = (char *) data->data + ctx->infos[i].offset;               // offset from data
-            }
-        }
-        if (!ok) {
-            fprintf(stderr, "%s: failed to read the tensor data\n", __func__);
-            fclose(file);
-            wsp_ggml_free(ctx_data);
-            wsp_gguf_free(ctx);
-            return NULL;
-        }
-        wsp_ggml_set_no_alloc(ctx_data, params.no_alloc);
-    }
-    fclose(file);
-    return ctx;
-}
-void wsp_gguf_free(struct wsp_gguf_context * ctx) {
-    if (ctx == NULL) {
-        return;
-    }
-    if (ctx->kv) {
-        // free string memory - not great..
-        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
-            wsp_gguf_free_kv(&ctx->kv[i]);
-        }
-        WSP_GGML_FREE(ctx->kv);
-    }
-    if (ctx->infos) {
-        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
-            struct wsp_gguf_tensor_info * info = &ctx->infos[i];
-            if (info->name.data) {
-                WSP_GGML_FREE(info->name.data);
-            }
-        }
-        WSP_GGML_FREE(ctx->infos);
-    }
-    WSP_GGML_FREE(ctx);
-}
-const char * wsp_gguf_type_name(enum wsp_gguf_type type) {
-    return WSP_GGUF_TYPE_NAME[type];
-}
-int wsp_gguf_get_version(const struct wsp_gguf_context * ctx) {
-    return ctx->header.version;
-}
-size_t wsp_gguf_get_alignment(const struct wsp_gguf_context * ctx) {
-    return ctx->alignment;
-}
-size_t wsp_gguf_get_data_offset(const struct wsp_gguf_context * ctx) {
-    return ctx->offset;
-}
-void * wsp_gguf_get_data(const struct wsp_gguf_context * ctx) {
-    return ctx->data;
-}
-int wsp_gguf_get_n_kv(const struct wsp_gguf_context * ctx) {
-    return ctx->header.n_kv;
-}
-int wsp_gguf_find_key(const struct wsp_gguf_context * ctx, const char * key) {
-    // return -1 if key not found
-    int keyfound = -1;
-    const int n_kv = wsp_gguf_get_n_kv(ctx);
-    for (int i = 0; i < n_kv; ++i) {
-        if (strcmp(key, wsp_gguf_get_key(ctx, i)) == 0) {
-            keyfound = i;
-            break;
-        }
-    }
-    return keyfound;
-}
-const char * wsp_gguf_get_key(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].key.data;
-}
-enum wsp_gguf_type wsp_gguf_get_kv_type(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    return ctx->kv[key_id].type;
-}
-enum wsp_gguf_type wsp_gguf_get_arr_type(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.type;
-}
-const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.data;
-}
-const char * wsp_gguf_get_arr_str(const struct wsp_gguf_context * ctx, int key_id, int i) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
-    struct wsp_gguf_kv * kv = &ctx->kv[key_id];
-    struct wsp_gguf_str * str = &((struct wsp_gguf_str *) kv->value.arr.data)[i];
-    return str->data;
-}
-int wsp_gguf_get_arr_n(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
-    return ctx->kv[key_id].value.arr.n;
-}
-uint8_t wsp_gguf_get_val_u8(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT8);
-    return ctx->kv[key_id].value.uint8;
-}
-int8_t wsp_gguf_get_val_i8(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT8);
-    return ctx->kv[key_id].value.int8;
-}
-uint16_t wsp_gguf_get_val_u16(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT16);
-    return ctx->kv[key_id].value.uint16;
-}
-int16_t wsp_gguf_get_val_i16(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT16);
-    return ctx->kv[key_id].value.int16;
-}
-uint32_t wsp_gguf_get_val_u32(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT32);
-    return ctx->kv[key_id].value.uint32;
-}
-int32_t wsp_gguf_get_val_i32(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT32);
-    return ctx->kv[key_id].value.int32;
-}
-float wsp_gguf_get_val_f32(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_FLOAT32);
-    return ctx->kv[key_id].value.float32;
-}
-uint64_t wsp_gguf_get_val_u64(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT64);
-    return ctx->kv[key_id].value.uint64;
-}
-int64_t wsp_gguf_get_val_i64(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT64);
-    return ctx->kv[key_id].value.int64;
-}
-double wsp_gguf_get_val_f64(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_FLOAT64);
-    return ctx->kv[key_id].value.float64;
-}
-bool wsp_gguf_get_val_bool(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_BOOL);
-    return ctx->kv[key_id].value.bool_;
-}
-const char * wsp_gguf_get_val_str(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_STRING);
-    return ctx->kv[key_id].value.str.data;
-}
-const void * wsp_gguf_get_val_data(const struct wsp_gguf_context * ctx, int key_id) {
-    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
-    WSP_GGML_ASSERT(ctx->kv[key_id].type != WSP_GGUF_TYPE_ARRAY);
-    WSP_GGML_ASSERT(ctx->kv[key_id].type != WSP_GGUF_TYPE_STRING);
-    return &ctx->kv[key_id].value;
-}
-int wsp_gguf_get_n_tensors(const struct wsp_gguf_context * ctx) {
-    return ctx->header.n_tensors;
-}
-int wsp_gguf_find_tensor(const struct wsp_gguf_context * ctx, const char * name) {
-    // return -1 if tensor not found
-    int tensorfound = -1;
-    const int n_tensors = wsp_gguf_get_n_tensors(ctx);
-    for (int i = 0; i < n_tensors; ++i) {
-        if (strcmp(name, wsp_gguf_get_tensor_name(ctx, i)) == 0) {
-            tensorfound = i;
-            break;
-        }
-    }
-    return tensorfound;
-}
-size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int i) {
-    return ctx->infos[i].offset;
-}
-char * wsp_gguf_get_tensor_name(const struct wsp_gguf_context * ctx, int i) {
-    return ctx->infos[i].name.data;
-}
-enum wsp_ggml_type wsp_gguf_get_tensor_type(const struct wsp_gguf_context * ctx, int i) {
-    return ctx->infos[i].type;
-}
-// returns the index
-static int wsp_gguf_get_or_add_key(struct wsp_gguf_context * ctx, const char * key) {
-    const int idx = wsp_gguf_find_key(ctx, key);
-    if (idx >= 0) {
-        return idx;
-    }
-    const int n_kv = wsp_gguf_get_n_kv(ctx);
-    ctx->kv = realloc(ctx->kv, (n_kv + 1) * sizeof(struct wsp_gguf_kv));
-    ctx->kv[n_kv].key.n    = strlen(key);
-    ctx->kv[n_kv].key.data = strdup(key);
-    ctx->header.n_kv++;
-    return n_kv;
-}
-void wsp_gguf_remove_key(struct wsp_gguf_context * ctx, const char * key) {
-    const int idx = wsp_gguf_find_key(ctx, key);
-    if (idx >= 0) {
-        const int n_kv = wsp_gguf_get_n_kv(ctx);
-        wsp_gguf_free_kv(&ctx->kv[idx]);
-        for (int i = idx; i < n_kv-1; ++i) {
-            ctx->kv[i] = ctx->kv[i+1];
-        }
-        ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct wsp_gguf_kv));
-        ctx->header.n_kv--;
-    }
-}
-void wsp_gguf_set_val_u8(struct wsp_gguf_context * ctx, const char * key, uint8_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type        = WSP_GGUF_TYPE_UINT8;
-    ctx->kv[idx].value.uint8 = val;
-}
-void wsp_gguf_set_val_i8(struct wsp_gguf_context * ctx, const char * key, int8_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type       = WSP_GGUF_TYPE_INT8;
-    ctx->kv[idx].value.int8 = val;
-}
-void wsp_gguf_set_val_u16(struct wsp_gguf_context * ctx, const char * key, uint16_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type         = WSP_GGUF_TYPE_UINT16;
-    ctx->kv[idx].value.uint16 = val;
-}
-void wsp_gguf_set_val_i16(struct wsp_gguf_context * ctx, const char * key, int16_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type        = WSP_GGUF_TYPE_INT16;
-    ctx->kv[idx].value.int16 = val;
-}
-void wsp_gguf_set_val_u32(struct wsp_gguf_context * ctx, const char * key, uint32_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type         = WSP_GGUF_TYPE_UINT32;
-    ctx->kv[idx].value.uint32 = val;
-}
-void wsp_gguf_set_val_i32(struct wsp_gguf_context * ctx, const char * key, int32_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type        = WSP_GGUF_TYPE_INT32;
-    ctx->kv[idx].value.int32 = val;
-}
-void wsp_gguf_set_val_f32(struct wsp_gguf_context * ctx, const char * key, float val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type          = WSP_GGUF_TYPE_FLOAT32;
-    ctx->kv[idx].value.float32 = val;
-}
-void wsp_gguf_set_val_u64(struct wsp_gguf_context * ctx, const char * key, uint64_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type         = WSP_GGUF_TYPE_UINT64;
-    ctx->kv[idx].value.uint64 = val;
-}
-void wsp_gguf_set_val_i64(struct wsp_gguf_context * ctx, const char * key, int64_t val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type        = WSP_GGUF_TYPE_INT64;
-    ctx->kv[idx].value.int64 = val;
-}
-void wsp_gguf_set_val_f64(struct wsp_gguf_context * ctx, const char * key, double val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type          = WSP_GGUF_TYPE_FLOAT64;
-    ctx->kv[idx].value.float64 = val;
-}
-void wsp_gguf_set_val_bool(struct wsp_gguf_context * ctx, const char * key, bool val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type        = WSP_GGUF_TYPE_BOOL;
-    ctx->kv[idx].value.bool_ = val;
-}
-void wsp_gguf_set_val_str(struct wsp_gguf_context * ctx, const char * key, const char * val) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type           = WSP_GGUF_TYPE_STRING;
-    ctx->kv[idx].value.str.n    = strlen(val);
-    ctx->kv[idx].value.str.data = strdup(val);
-}
-void wsp_gguf_set_arr_data(struct wsp_gguf_context * ctx, const char * key, enum wsp_gguf_type type, const void * data, int n) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type           = WSP_GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = type;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = WSP_GGML_CALLOC(n, wsp_gguf_type_size(type));
-    memcpy(ctx->kv[idx].value.arr.data, data, n*wsp_gguf_type_size(type));
-}
-void wsp_gguf_set_arr_str(struct wsp_gguf_context * ctx, const char * key, const char ** data, int n) {
-    const int idx = wsp_gguf_get_or_add_key(ctx, key);
-    ctx->kv[idx].type           = WSP_GGUF_TYPE_ARRAY;
-    ctx->kv[idx].value.arr.type = WSP_GGUF_TYPE_STRING;
-    ctx->kv[idx].value.arr.n    = n;
-    ctx->kv[idx].value.arr.data = WSP_GGML_CALLOC(n, sizeof(struct wsp_gguf_str));
-    for (int i = 0; i < n; i++) {
-        struct wsp_gguf_str * str = &((struct wsp_gguf_str *)ctx->kv[idx].value.arr.data)[i];
-        str->n    = strlen(data[i]);
-        str->data = strdup(data[i]);
-    }
-}
-// Replay every KV pair of `src` into `ctx` through the typed setters.
-// Scalars and strings map to the matching wsp_gguf_set_val_* call, arrays go through
-// wsp_gguf_set_arr_str / wsp_gguf_set_arr_data; nested arrays and unknown types abort.
-void wsp_gguf_set_kv(struct wsp_gguf_context * ctx, struct wsp_gguf_context * src) {
-    for (uint32_t i = 0; i < src->header.n_kv; i++) {
-        // re-derived on every iteration and only read before the setter call below
-        const struct wsp_gguf_kv * kv  = &src->kv[i];
-        const char               * key = kv->key.data;
-        switch (kv->type) {
-            case WSP_GGUF_TYPE_UINT8:
-                wsp_gguf_set_val_u8(ctx, key, kv->value.uint8);
-                break;
-            case WSP_GGUF_TYPE_INT8:
-                wsp_gguf_set_val_i8(ctx, key, kv->value.int8);
-                break;
-            case WSP_GGUF_TYPE_UINT16:
-                wsp_gguf_set_val_u16(ctx, key, kv->value.uint16);
-                break;
-            case WSP_GGUF_TYPE_INT16:
-                wsp_gguf_set_val_i16(ctx, key, kv->value.int16);
-                break;
-            case WSP_GGUF_TYPE_UINT32:
-                wsp_gguf_set_val_u32(ctx, key, kv->value.uint32);
-                break;
-            case WSP_GGUF_TYPE_INT32:
-                wsp_gguf_set_val_i32(ctx, key, kv->value.int32);
-                break;
-            case WSP_GGUF_TYPE_FLOAT32:
-                wsp_gguf_set_val_f32(ctx, key, kv->value.float32);
-                break;
-            case WSP_GGUF_TYPE_UINT64:
-                wsp_gguf_set_val_u64(ctx, key, kv->value.uint64);
-                break;
-            case WSP_GGUF_TYPE_INT64:
-                wsp_gguf_set_val_i64(ctx, key, kv->value.int64);
-                break;
-            case WSP_GGUF_TYPE_FLOAT64:
-                wsp_gguf_set_val_f64(ctx, key, kv->value.float64);
-                break;
-            case WSP_GGUF_TYPE_BOOL:
-                wsp_gguf_set_val_bool(ctx, key, kv->value.bool_);
-                break;
-            case WSP_GGUF_TYPE_STRING:
-                wsp_gguf_set_val_str(ctx, key, kv->value.str.data);
-                break;
-            case WSP_GGUF_TYPE_ARRAY:
-                if (kv->value.arr.type == WSP_GGUF_TYPE_ARRAY) {
-                    WSP_GGML_ABORT("nested arrays not supported");
-                } else if (kv->value.arr.type == WSP_GGUF_TYPE_STRING) {
-                    // wsp_gguf_set_arr_str expects plain char pointers, so build a temporary view
-                    const char ** strs = WSP_GGML_CALLOC(kv->value.arr.n, sizeof(char *));
-                    for (uint32_t j = 0; j < kv->value.arr.n; j++) {
-                        strs[j] = ((struct wsp_gguf_str *)kv->value.arr.data)[j].data;
-                    }
-                    wsp_gguf_set_arr_str(ctx, key, strs, kv->value.arr.n);
-                    WSP_GGML_FREE((void *)strs);
-                } else {
-                    wsp_gguf_set_arr_data(ctx, key, kv->value.arr.type, kv->value.arr.data, kv->value.arr.n);
-                }
-                break;
-            default:
-                WSP_GGML_ABORT("invalid type");
-        }
-    }
-}
-// Append a description of `tensor` to the tensor-info table of `ctx`.
-// The name is duplicated, unused dimensions are set to 1, and the data offset is
-// placed right after the previous tensor, padded to ctx->alignment.
-// Aborts on a duplicated tensor name or when memory cannot be allocated.
-void wsp_gguf_add_tensor(
-             struct wsp_gguf_context * ctx,
-        const struct wsp_ggml_tensor * tensor) {
-    WSP_GGML_ASSERT(tensor);
-    if (wsp_gguf_find_tensor(ctx, tensor->name) != -1) {
-        WSP_GGML_ABORT("duplicated tensor name");
-    }
-    const int idx = ctx->header.n_tensors;
-    // keep the old table reachable until the reallocation is known to have succeeded
-    struct wsp_gguf_tensor_info * infos = realloc(ctx->infos, (idx + 1)*sizeof(struct wsp_gguf_tensor_info));
-    if (infos == NULL) {
-        WSP_GGML_ABORT("failed to allocate memory for tensor info");
-    }
-    ctx->infos = infos;
-    struct wsp_gguf_tensor_info * info = &ctx->infos[idx];
-    info->name.n    = strlen(tensor->name);
-    info->name.data = strdup(tensor->name);
-    if (info->name.data == NULL) {
-        WSP_GGML_ABORT("failed to allocate memory for tensor name");
-    }
-    for (int i = 0; i < WSP_GGML_MAX_DIMS; ++i) {
-        info->ne[i] = 1;
-    }
-    info->n_dims = wsp_ggml_n_dims(tensor);
-    for (uint32_t i = 0; i < info->n_dims; i++) {
-        info->ne[i] = tensor->ne[i];
-    }
-    info->type   = tensor->type;
-    info->offset = 0;
-    info->data   = tensor->data;
-    info->size   = wsp_ggml_nbytes(tensor);
-    if (ctx->header.n_tensors > 0) {
-        // data sections are laid out back to back, each padded to the context alignment
-        info->offset = ctx->infos[idx - 1].offset + WSP_GGML_PAD(ctx->infos[idx - 1].size, ctx->alignment);
-    }
-    ctx->header.n_tensors++;
-}
-// Change the recorded type of the tensor called `name`; aborts if no such tensor exists.
-void wsp_gguf_set_tensor_type(struct wsp_gguf_context * ctx, const char * name, enum wsp_ggml_type type) {
-    const int tensor_idx = wsp_gguf_find_tensor(ctx, name);
-    if (tensor_idx < 0) {
-        WSP_GGML_ABORT("tensor not found");
-    }
-    struct wsp_gguf_tensor_info * info = &ctx->infos[tensor_idx];
-    info->type = type;
-}
-// Point the tensor called `name` at `data` of `size` bytes (the pointer is stored, not copied)
-// and recompute the offsets of all tensors that follow it. Aborts if the tensor is unknown.
-void wsp_gguf_set_tensor_data(struct wsp_gguf_context * ctx, const char * name, const void * data, size_t size) {
-    const int tensor_idx = wsp_gguf_find_tensor(ctx, name);
-    if (tensor_idx < 0) {
-        WSP_GGML_ABORT("tensor not found");
-    }
-    struct wsp_gguf_tensor_info * changed = &ctx->infos[tensor_idx];
-    changed->data = data;
-    changed->size = size;
-    // every later tensor starts where its predecessor's padded data ends
-    for (uint32_t i = tensor_idx + 1; i < ctx->header.n_tensors; ++i) {
-        const struct wsp_gguf_tensor_info * prev = &ctx->infos[i - 1];
-        ctx->infos[i].offset = prev->offset + WSP_GGML_PAD(prev->size, ctx->alignment);
-    }
-}
-//static void wsp_gguf_fwrite_str(FILE * file, const struct wsp_gguf_str * val) {
-//    fwrite(&val->n,   sizeof(val->n),    1, file);
-//    fwrite(val->data, sizeof(char), val->n, file);
-//}
-//
-//static void wsp_gguf_fwrite_el(FILE * file, const void * val, size_t size) {
-//    fwrite(val, sizeof(char), size, file);
-//}
-struct wsp_gguf_buf { // write buffer filled by the wsp_gguf_bwrite_* helpers
-    void * data;   // storage; NULL when initialized with size 0, in which case writes only advance `offset`
-    size_t size;   // capacity in bytes, raised by wsp_gguf_buf_grow
-    size_t offset; // number of bytes written so far (next write position)
-};
-// Create a write buffer with `size` bytes of zeroed storage.
-// A size of 0 yields a buffer without storage (data == NULL).
-static struct wsp_gguf_buf wsp_gguf_buf_init(size_t size) {
-    struct wsp_gguf_buf buf;
-    buf.data   = NULL;
-    buf.size   = size;
-    buf.offset = 0;
-    if (size != 0) {
-        buf.data = WSP_GGML_CALLOC(1, size);
-    }
-    return buf;
-}
-// Release the storage of `buf`, if it has any.
-static void wsp_gguf_buf_free(struct wsp_gguf_buf buf) {
-    if (buf.data == NULL) {
-        return;
-    }
-    WSP_GGML_FREE(buf.data);
-}
-// Make sure `size` more bytes fit behind buf->offset.
-// When they do not, the capacity becomes 1.5x the required total. The capacity is
-// updated even for a buffer without storage (data == NULL), but storage is only
-// reallocated when it exists. Aborts if the reallocation fails.
-static void wsp_gguf_buf_grow(struct wsp_gguf_buf * buf, size_t size) {
-    const size_t required = buf->offset + size;
-    if (required <= buf->size) {
-        return;
-    }
-    // integer form of 1.5*required: no round trip through double, exact for any size_t
-    buf->size = required + required/2;
-    if (buf->data) {
-        // do not overwrite buf->data until realloc is known to have succeeded
-        void * grown = realloc(buf->data, buf->size);
-        if (grown == NULL) {
-            WSP_GGML_ABORT("failed to grow write buffer");
-        }
-        buf->data = grown;
-    }
-}
-// Append a string to `buf`: first the length field val->n, then val->n bytes of val->data.
-// Without storage (buf->data == NULL) only buf->offset is advanced.
-static void wsp_gguf_bwrite_str(struct wsp_gguf_buf * buf, const struct wsp_gguf_str * val) {
-    const size_t len_field = sizeof(val->n);
-    wsp_gguf_buf_grow(buf, sizeof(val->n) + val->n);
-    if (buf->data) {
-        char * dst = (char *) buf->data + buf->offset;
-        memcpy(dst,             &val->n,   len_field);
-        memcpy(dst + len_field, val->data, val->n);
-    }
-    buf->offset += len_field;
-    buf->offset += val->n;
-}
-// Append `el_size` raw bytes from `val` to `buf`.
-// Without storage (buf->data == NULL) only buf->offset is advanced.
-static void wsp_gguf_bwrite_el(struct wsp_gguf_buf * buf, const void * val, size_t el_size) {
-    wsp_gguf_buf_grow(buf, el_size);
-    char * base = (char *) buf->data;
-    if (base != NULL) {
-        memcpy(base + buf->offset, val, el_size);
-    }
-    buf->offset += el_size;
-}
-// Serialize `ctx` into `buf` in this order: header, KV pairs, tensor infos, zero padding
-// up to ctx->alignment and, unless `only_meta` is set, each tensor's data padded to
-// ctx->alignment. Aborts on nested arrays and unknown value types.
-static void wsp_gguf_write_to_buf(const struct wsp_gguf_context * ctx, struct wsp_gguf_buf * buf, bool only_meta) {
-    const uint8_t zero_byte = 0;
-    // header
-    wsp_gguf_bwrite_el(buf, &ctx->header.magic,     sizeof(ctx->header.magic));
-    wsp_gguf_bwrite_el(buf, &ctx->header.version,   sizeof(ctx->header.version));
-    wsp_gguf_bwrite_el(buf, &ctx->header.n_tensors, sizeof(ctx->header.n_tensors));
-    wsp_gguf_bwrite_el(buf, &ctx->header.n_kv,      sizeof(ctx->header.n_kv));
-    // key-value pairs: key, type tag, then the value in a type-specific encoding
-    for (uint32_t ikv = 0; ikv < ctx->header.n_kv; ++ikv) {
-        const struct wsp_gguf_kv * kv = &ctx->kv[ikv];
-        wsp_gguf_bwrite_str(buf, &kv->key);
-        wsp_gguf_bwrite_el (buf, &kv->type, sizeof(kv->type));
-        switch (kv->type) {
-            case WSP_GGUF_TYPE_UINT8:
-                wsp_gguf_bwrite_el(buf, &kv->value.uint8, sizeof(kv->value.uint8));
-                break;
-            case WSP_GGUF_TYPE_INT8:
-                wsp_gguf_bwrite_el(buf, &kv->value.int8, sizeof(kv->value.int8));
-                break;
-            case WSP_GGUF_TYPE_UINT16:
-                wsp_gguf_bwrite_el(buf, &kv->value.uint16, sizeof(kv->value.uint16));
-                break;
-            case WSP_GGUF_TYPE_INT16:
-                wsp_gguf_bwrite_el(buf, &kv->value.int16, sizeof(kv->value.int16));
-                break;
-            case WSP_GGUF_TYPE_UINT32:
-                wsp_gguf_bwrite_el(buf, &kv->value.uint32, sizeof(kv->value.uint32));
-                break;
-            case WSP_GGUF_TYPE_INT32:
-                wsp_gguf_bwrite_el(buf, &kv->value.int32, sizeof(kv->value.int32));
-                break;
-            case WSP_GGUF_TYPE_FLOAT32:
-                wsp_gguf_bwrite_el(buf, &kv->value.float32, sizeof(kv->value.float32));
-                break;
-            case WSP_GGUF_TYPE_UINT64:
-                wsp_gguf_bwrite_el(buf, &kv->value.uint64, sizeof(kv->value.uint64));
-                break;
-            case WSP_GGUF_TYPE_INT64:
-                wsp_gguf_bwrite_el(buf, &kv->value.int64, sizeof(kv->value.int64));
-                break;
-            case WSP_GGUF_TYPE_FLOAT64:
-                wsp_gguf_bwrite_el(buf, &kv->value.float64, sizeof(kv->value.float64));
-                break;
-            case WSP_GGUF_TYPE_BOOL:
-                wsp_gguf_bwrite_el(buf, &kv->value.bool_, sizeof(kv->value.bool_));
-                break;
-            case WSP_GGUF_TYPE_STRING:
-                wsp_gguf_bwrite_str(buf, &kv->value.str);
-                break;
-            case WSP_GGUF_TYPE_ARRAY:
-                // arrays carry their element type and count ahead of the payload
-                wsp_gguf_bwrite_el(buf, &kv->value.arr.type, sizeof(kv->value.arr.type));
-                wsp_gguf_bwrite_el(buf, &kv->value.arr.n,    sizeof(kv->value.arr.n));
-                switch (kv->value.arr.type) {
-                    case WSP_GGUF_TYPE_UINT8:
-                    case WSP_GGUF_TYPE_INT8:
-                    case WSP_GGUF_TYPE_UINT16:
-                    case WSP_GGUF_TYPE_INT16:
-                    case WSP_GGUF_TYPE_UINT32:
-                    case WSP_GGUF_TYPE_INT32:
-                    case WSP_GGUF_TYPE_FLOAT32:
-                    case WSP_GGUF_TYPE_UINT64:
-                    case WSP_GGUF_TYPE_INT64:
-                    case WSP_GGUF_TYPE_FLOAT64:
-                    case WSP_GGUF_TYPE_BOOL:
-                        // fixed-size elements are stored contiguously and written in one go
-                        wsp_gguf_bwrite_el(buf, kv->value.arr.data, kv->value.arr.n * wsp_gguf_type_size(kv->value.arr.type));
-                        break;
-                    case WSP_GGUF_TYPE_STRING:
-                        {
-                            const struct wsp_gguf_str * strs = (struct wsp_gguf_str *) kv->value.arr.data;
-                            for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
-                                wsp_gguf_bwrite_str(buf, &strs[j]);
-                            }
-                        }
-                        break;
-                    case WSP_GGUF_TYPE_ARRAY:
-                    default:
-                        WSP_GGML_ABORT("invalid type");
-                }
-                break;
-            default:
-                WSP_GGML_ABORT("invalid type");
-        }
-    }
-    // tensor infos: name, rank, the first n_dims extents, type and data offset
-    for (uint32_t it = 0; it < ctx->header.n_tensors; ++it) {
-        const struct wsp_gguf_tensor_info * ti = &ctx->infos[it];
-        wsp_gguf_bwrite_str(buf, &ti->name);
-        wsp_gguf_bwrite_el (buf, &ti->n_dims, sizeof(ti->n_dims));
-        for (uint32_t d = 0; d < ti->n_dims; ++d) {
-            wsp_gguf_bwrite_el(buf, &ti->ne[d], sizeof(ti->ne[d]));
-        }
-        wsp_gguf_bwrite_el(buf, &ti->type,   sizeof(ti->type));
-        wsp_gguf_bwrite_el(buf, &ti->offset, sizeof(ti->offset));
-    }
-    // the data section must start aligned, so zero-fill up to the next multiple of the alignment
-    {
-        const size_t meta_end = buf->offset;
-        size_t n_pad = WSP_GGML_PAD(meta_end, ctx->alignment) - meta_end;
-        while (n_pad > 0) {
-            wsp_gguf_bwrite_el(buf, &zero_byte, sizeof(zero_byte));
-            --n_pad;
-        }
-    }
-    if (only_meta) {
-        return;
-    }
-    // tensor data, each blob zero-padded to the alignment; `offset` tracks where the
-    // next blob is expected to start relative to the beginning of the data section
-    size_t offset = 0;
-    for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
-        const struct wsp_gguf_tensor_info * info = &ctx->infos[i];
-        const size_t n_bytes  = info->size;
-        const size_t n_padded = WSP_GGML_PAD(n_bytes, ctx->alignment);
-        wsp_gguf_bwrite_el(buf, info->data, n_bytes);
-        for (size_t n_pad = n_padded - n_bytes; n_pad > 0; --n_pad) {
-            wsp_gguf_bwrite_el(buf, &zero_byte, sizeof(zero_byte));
-        }
-        WSP_GGML_ASSERT(offset == info->offset);
-        offset += n_padded;
-    }
-}
-// Serialize `ctx` (metadata only when `only_meta` is set) and write it to `fname`.
-// The whole image is built in memory first and then written with a single fwrite().
-// Aborts when the file cannot be opened, or when writing or closing it fails, so a
-// truncated file is never reported as success.
-void wsp_gguf_write_to_file(const struct wsp_gguf_context * ctx, const char * fname, bool only_meta) {
-    FILE * file = wsp_ggml_fopen(fname, "wb");
-    if (!file) {
-        WSP_GGML_ABORT("failed to open file for writing");
-    }
-    struct wsp_gguf_buf buf = wsp_gguf_buf_init(16*1024);
-    wsp_gguf_write_to_buf(ctx, &buf, only_meta);
-    // fwrite may write fewer bytes than requested; compare against the full image size
-    const bool write_ok = fwrite(buf.data, 1, buf.offset, file) == buf.offset;
-    wsp_gguf_buf_free(buf);
-    // fclose flushes buffered output, so its result matters for a write stream
-    const bool close_ok = fclose(file) == 0;
-    if (!write_ok || !close_ok) {
-        WSP_GGML_ABORT("failed to write file");
-    }
-}
-// Return the number of bytes the metadata of `ctx` occupies when serialized.
-// Uses a buffer without storage, so the write pass only advances the offset.
-size_t wsp_gguf_get_meta_size(const struct wsp_gguf_context * ctx) {
-    const bool only_meta = true;
-    struct wsp_gguf_buf counter = wsp_gguf_buf_init(0);
-    wsp_gguf_write_to_buf(ctx, &counter, only_meta);
-    return counter.offset;
-}
-// Serialize the metadata of `ctx` and copy it to `data`.
-// The number of bytes copied equals what wsp_gguf_get_meta_size() reports for the same context.
-void wsp_gguf_get_meta_data(const struct wsp_gguf_context * ctx, void * data) {
-    const bool only_meta = true;
-    struct wsp_gguf_buf meta = wsp_gguf_buf_init(16*1024);
-    wsp_gguf_write_to_buf(ctx, &meta, only_meta);
-    memcpy(data, meta.data, meta.offset);
-    wsp_gguf_buf_free(meta);
-}
-// Install `log_callback` as the logger, falling back to wsp_ggml_log_callback_default
-// when it is NULL, and remember `user_data` alongside it.
-void wsp_ggml_log_set(wsp_ggml_log_callback log_callback, void * user_data) {
-    if (!log_callback) {
-        log_callback = wsp_ggml_log_callback_default;
-    }
-    g_logger_state.log_callback           = log_callback;
-    g_logger_state.log_callback_user_data = user_data;
+// Return true when both parameter sets agree on n_threads, prio, poll, strict_cpu
+// and on the first WSP_GGML_MAX_N_THREADS bytes of cpumask.
+bool wsp_ggml_threadpool_params_match(const struct wsp_ggml_threadpool_params * p0, const struct wsp_ggml_threadpool_params * p1) {
+    const bool scalars_equal =
+        p0->n_threads  == p1->n_threads  &&
+        p0->prio       == p1->prio       &&
+        p0->poll       == p1->poll       &&
+        p0->strict_cpu == p1->strict_cpu;
+    if (!scalars_equal) {
+        return false;
+    }
+    return memcmp(p0->cpumask, p1->cpumask, WSP_GGML_MAX_N_THREADS) == 0;
 }