npm - whisper.rn - Versions diffs - 0.4.0-rc.4 → 0.4.0-rc.6 - Mend

whisper.rn 0.4.0-rc.4 → 0.4.0-rc.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (49)

package/README.md +6 -6
package/android/build.gradle +4 -0
package/android/src/main/CMakeLists.txt +5 -0
package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
package/android/src/main/java/com/rnwhisper/WhisperContext.java +57 -134
package/android/src/main/jni-utils.h +76 -0
package/android/src/main/jni.cpp +188 -112
package/cpp/README.md +1 -1
package/cpp/coreml/whisper-encoder-impl.h +1 -1
package/cpp/coreml/whisper-encoder.h +4 -0
package/cpp/coreml/whisper-encoder.mm +4 -2
package/cpp/ggml-alloc.c +55 -19
package/cpp/ggml-alloc.h +8 -1
package/cpp/ggml-backend-impl.h +46 -21
package/cpp/ggml-backend.c +563 -156
package/cpp/ggml-backend.h +62 -17
package/cpp/ggml-impl.h +1 -1
package/cpp/ggml-metal-whisper.metal +2444 -359
package/cpp/ggml-metal.h +7 -1
package/cpp/ggml-metal.m +1105 -197
package/cpp/ggml-quants.c +66 -61
package/cpp/ggml-quants.h +40 -40
package/cpp/ggml.c +1040 -1590
package/cpp/ggml.h +109 -30
package/cpp/rn-audioutils.cpp +68 -0
package/cpp/rn-audioutils.h +14 -0
package/cpp/rn-whisper-log.h +11 -0
package/cpp/rn-whisper.cpp +143 -59
package/cpp/rn-whisper.h +48 -15
package/cpp/whisper.cpp +1635 -928
package/cpp/whisper.h +55 -10
package/ios/RNWhisper.mm +7 -7
package/ios/RNWhisperAudioUtils.h +0 -2
package/ios/RNWhisperAudioUtils.m +0 -56
package/ios/RNWhisperContext.h +3 -11
package/ios/RNWhisperContext.mm +68 -137
package/lib/commonjs/index.js.map +1 -1
package/lib/commonjs/version.json +1 -1
package/lib/module/index.js.map +1 -1
package/lib/module/version.json +1 -1
package/lib/typescript/index.d.ts +5 -0
package/lib/typescript/index.d.ts.map +1 -1
package/package.json +6 -5
package/src/index.ts +5 -0
package/src/version.json +1 -1
package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -4
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +0 -8
package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +0 -19

package/cpp/ggml.c CHANGED

@@ -1,4 +1,4 @@
-#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
+#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnings on Windows
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 #include "ggml-impl.h"
@@ -33,7 +33,7 @@
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
-// disable POSIX deprecation warnigns
+// disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
 #endif
@@ -233,24 +233,6 @@ inline static void * wsp_ggml_aligned_malloc(size_t size) {
 #define UNUSED WSP_GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
-//
-// tensor access macros
-//
-#define WSP_GGML_TENSOR_UNARY_OP_LOCALS \
-    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    WSP_GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    WSP_GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    WSP_GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-#define WSP_GGML_TENSOR_BINARY_OP_LOCALS \
-    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    WSP_GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    WSP_GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    WSP_GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    WSP_GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
 #if defined(WSP_GGML_USE_ACCELERATE)
 #include <Accelerate/Accelerate.h>
 #if defined(WSP_GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
@@ -455,9 +437,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK4_0,
         .type_size                = sizeof(block_q4_0),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q4_0,
-        .from_float               = quantize_row_q4_0,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q4_0_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q4_0,
+        .from_float               = wsp_quantize_row_q4_0,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q4_0_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q4_0_q8_0,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_0,
     },
@@ -466,9 +448,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK4_1,
         .type_size                = sizeof(block_q4_1),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q4_1,
-        .from_float               = quantize_row_q4_1,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q4_1_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q4_1,
+        .from_float               = wsp_quantize_row_q4_1,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q4_1_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q4_1_q8_1,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_1,
     },
@@ -499,9 +481,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK5_0,
         .type_size                = sizeof(block_q5_0),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q5_0,
-        .from_float               = quantize_row_q5_0,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q5_0_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q5_0,
+        .from_float               = wsp_quantize_row_q5_0,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q5_0_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q5_0_q8_0,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_0,
     },
@@ -510,9 +492,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK5_1,
         .type_size                = sizeof(block_q5_1),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q5_1,
-        .from_float               = quantize_row_q5_1,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q5_1_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q5_1,
+        .from_float               = wsp_quantize_row_q5_1,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q5_1_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q5_1_q8_1,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_1,
     },
@@ -521,9 +503,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK8_0,
         .type_size                = sizeof(block_q8_0),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q8_0,
-        .from_float               = quantize_row_q8_0,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q8_0_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q8_0,
+        .from_float               = wsp_quantize_row_q8_0,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q8_0_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q8_0_q8_0,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_0,
     },
@@ -532,8 +514,8 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK8_1,
         .type_size                = sizeof(block_q8_1),
         .is_quantized             = true,
-        .from_float               = quantize_row_q8_1,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q8_1_reference,
+        .from_float               = wsp_quantize_row_q8_1,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q8_1_reference,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_1,
     },
     [WSP_GGML_TYPE_Q2_K] = {
@@ -541,9 +523,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q2_K),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q2_K,
-        .from_float               = quantize_row_q2_K,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q2_K_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q2_K,
+        .from_float               = wsp_quantize_row_q2_K,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q2_K_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q2_K_q8_K,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_K,
     },
@@ -552,9 +534,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q3_K),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q3_K,
-        .from_float               = quantize_row_q3_K,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q3_K_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q3_K,
+        .from_float               = wsp_quantize_row_q3_K,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q3_K_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q3_K_q8_K,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_K,
     },
@@ -563,9 +545,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q4_K),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q4_K,
-        .from_float               = quantize_row_q4_K,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q4_K_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q4_K,
+        .from_float               = wsp_quantize_row_q4_K,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q4_K_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q4_K_q8_K,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_K,
     },
@@ -574,9 +556,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q5_K),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q5_K,
-        .from_float               = quantize_row_q5_K,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q5_K_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q5_K,
+        .from_float               = wsp_quantize_row_q5_K,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q5_K_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q5_K_q8_K,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_K,
     },
@@ -585,9 +567,9 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q6_K),
         .is_quantized             = true,
-        .to_float                 = (wsp_ggml_to_float_t) dequantize_row_q6_K,
-        .from_float               = quantize_row_q6_K,
-        .from_float_reference     = (wsp_ggml_from_float_t) quantize_row_q6_K_reference,
+        .to_float                 = (wsp_ggml_to_float_t) wsp_dewsp_quantize_row_q6_K,
+        .from_float               = wsp_quantize_row_q6_K,
+        .from_float_reference     = (wsp_ggml_from_float_t) wsp_quantize_row_q6_K_reference,
         .vec_dot                  = wsp_ggml_vec_dot_q6_K_q8_K,
         .vec_dot_type             = WSP_GGML_TYPE_Q8_K,
     },
@@ -596,7 +578,7 @@ static const wsp_ggml_type_traits_t type_traits[WSP_GGML_TYPE_COUNT] = {
         .blck_size                = QK_K,
         .type_size                = sizeof(block_q8_K),
         .is_quantized             = true,
-        .from_float               = quantize_row_q8_K,
+        .from_float               = wsp_quantize_row_q8_K,
     }
 };
@@ -1413,7 +1395,7 @@ inline static void wsp_ggml_vec_step_f32 (const int n, float * y, const float *
 inline static void wsp_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]);  }
 inline static void wsp_ggml_vec_elu_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
 inline static void wsp_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
-inline static void wsp_ggml_vec_leaky_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.1f*x[i]; }
+inline static void wsp_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 static const float GELU_COEF_A     = 0.044715f;
 static const float GELU_QUICK_COEF = -1.702f;
@@ -1613,6 +1595,7 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "GROUP_NORM",
     "MUL_MAT",
+    "MUL_MAT_ID",
     "OUT_PROD",
     "SCALE",
@@ -1634,17 +1617,15 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "ROPE_BACK",
     "ALIBI",
     "CLAMP",
-    "CONV_1D",
-    "CONV_1D_STAGE_0",
-    "CONV_1D_STAGE_1",
     "CONV_TRANSPOSE_1D",
-    "CONV_2D",
-    "CONV_2D_STAGE_0",
-    "CONV_2D_STAGE_1",
+    "IM2COL",
     "CONV_TRANSPOSE_2D",
     "POOL_1D",
     "POOL_2D",
     "UPSCALE",
+    "PAD",
+    "ARGSORT",
+    "LEAKY_RELU",
     "FLASH_ATTN",
     "FLASH_FF",
@@ -1671,7 +1652,7 @@ static const char * WSP_GGML_OP_NAME[WSP_GGML_OP_COUNT] = {
     "CROSS_ENTROPY_LOSS_BACK",
 };
-static_assert(WSP_GGML_OP_COUNT == 73, "WSP_GGML_OP_COUNT != 73");
+static_assert(WSP_GGML_OP_COUNT == 72, "WSP_GGML_OP_COUNT != 72");
 static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "none",
@@ -1700,6 +1681,7 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "group_norm(x)",
     "X*Y",
+    "X[i]*Y",
     "X*Y",
     "x*v",
@@ -1721,17 +1703,15 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "rope_back(x)",
     "alibi(x)",
     "clamp(x)",
-    "conv_1d(x)",
-    "conv_1d_stage_0(x)",
-    "conv_1d_stage_1(x)",
     "conv_transpose_1d(x)",
-    "conv_2d(x)",
-    "conv_2d_stage_0(x)",
-    "conv_2d_stage_1(x)",
+    "im2col(x)",
     "conv_transpose_2d(x)",
     "pool_1d(x)",
     "pool_2d(x)",
     "upscale(x)",
+    "pad(x)",
+    "argsort(x)",
+    "leaky_relu(x)",
     "flash_attn(x)",
     "flash_ff(x)",
@@ -1758,15 +1738,32 @@ static const char * WSP_GGML_OP_SYMBOL[WSP_GGML_OP_COUNT] = {
     "cross_entropy_loss_back(x,y)",
 };
-static_assert(WSP_GGML_OP_COUNT == 73, "WSP_GGML_OP_COUNT != 73");
+static_assert(WSP_GGML_OP_COUNT == 72, "WSP_GGML_OP_COUNT != 72");
 static_assert(WSP_GGML_OP_POOL_COUNT == 2, "WSP_GGML_OP_POOL_COUNT != 2");
+static const char * WSP_GGML_UNARY_OP_NAME[WSP_GGML_UNARY_OP_COUNT] = {
+    "ABS",
+    "SGN",
+    "NEG",
+    "STEP",
+    "TANH",
+    "ELU",
+    "RELU",
+    "GELU",
+    "GELU_QUICK",
+    "SILU",
+};
+static_assert(WSP_GGML_UNARY_OP_COUNT == 10, "WSP_GGML_UNARY_OP_COUNT != 10");
 static_assert(sizeof(struct wsp_ggml_object)%WSP_GGML_MEM_ALIGN == 0, "wsp_ggml_object size must be a multiple of WSP_GGML_MEM_ALIGN");
 static_assert(sizeof(struct wsp_ggml_tensor)%WSP_GGML_MEM_ALIGN == 0, "wsp_ggml_tensor size must be a multiple of WSP_GGML_MEM_ALIGN");
 // WARN:
-// Mis-confguration can lead to problem that's hard to reason about:
+// Mis-configuration can lead to problem that's hard to reason about:
 // * At best  it crash or talks nosense.
 // * At worst it talks slightly difference but hard to perceive.
 //
@@ -1781,18 +1778,13 @@ static void wsp_ggml_setup_op_has_task_pass(void) {
         p[WSP_GGML_OP_ACC                    ] = true;
         p[WSP_GGML_OP_MUL_MAT                ] = true;
+        p[WSP_GGML_OP_MUL_MAT_ID             ] = true;
         p[WSP_GGML_OP_OUT_PROD               ] = true;
         p[WSP_GGML_OP_SET                    ] = true;
         p[WSP_GGML_OP_GET_ROWS_BACK          ] = true;
         p[WSP_GGML_OP_DIAG_MASK_INF          ] = true;
         p[WSP_GGML_OP_DIAG_MASK_ZERO         ] = true;
-        p[WSP_GGML_OP_CONV_1D                ] = true;
-        p[WSP_GGML_OP_CONV_1D_STAGE_0        ] = true;
-        p[WSP_GGML_OP_CONV_1D_STAGE_1        ] = true;
         p[WSP_GGML_OP_CONV_TRANSPOSE_1D      ] = true;
-        p[WSP_GGML_OP_CONV_2D                ] = true;
-        p[WSP_GGML_OP_CONV_2D_STAGE_0        ] = true;
-        p[WSP_GGML_OP_CONV_2D_STAGE_1        ] = true;
         p[WSP_GGML_OP_CONV_TRANSPOSE_2D      ] = true;
         p[WSP_GGML_OP_FLASH_ATTN_BACK        ] = true;
         p[WSP_GGML_OP_CROSS_ENTROPY_LOSS     ] = true;
@@ -2039,6 +2031,20 @@ const char * wsp_ggml_op_symbol(enum wsp_ggml_op op) {
     return WSP_GGML_OP_SYMBOL[op];
 }
+const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op) {
+    return WSP_GGML_UNARY_OP_NAME[op];
+}
+const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t) {
+    if (t->op == WSP_GGML_OP_UNARY) {
+        enum wsp_ggml_unary_op uop = wsp_ggml_get_unary_op(t);
+        return wsp_ggml_unary_op_name(uop);
+    }
+    else {
+        return wsp_ggml_op_name(t->op);
+    }
+}
 size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor) {
     return wsp_ggml_type_size(tensor->type);
 }
@@ -3170,9 +3176,7 @@ static struct wsp_ggml_tensor * wsp_ggml_add_impl(
         struct wsp_ggml_tensor * a,
         struct wsp_ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a));
-    WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(b, a));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a));
     bool is_node = false;
@@ -3387,9 +3391,7 @@ static struct wsp_ggml_tensor * wsp_ggml_mul_impl(
         struct wsp_ggml_tensor * a,
         struct wsp_ggml_tensor * b,
         bool inplace) {
-    // TODO: support less-strict constraint
-    //       WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a));
-    WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(b, a));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a));
     bool is_node = false;
@@ -3434,7 +3436,7 @@ static struct wsp_ggml_tensor * wsp_ggml_div_impl(
         struct wsp_ggml_tensor * a,
         struct wsp_ggml_tensor * b,
         bool inplace) {
-    WSP_GGML_ASSERT(wsp_ggml_are_same_shape(a, b));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(b, a));
     bool is_node = false;
@@ -3831,12 +3833,25 @@ struct wsp_ggml_tensor * wsp_ggml_relu_inplace(
     return wsp_ggml_unary_inplace(ctx, a, WSP_GGML_UNARY_OP_RELU);
 }
-// wsp_ggml_leaky
+// wsp_ggml_leaky_relu
-struct wsp_ggml_tensor * wsp_ggml_leaky(
+struct wsp_ggml_tensor * wsp_ggml_leaky_relu(
         struct wsp_ggml_context * ctx,
-        struct wsp_ggml_tensor  * a) {
-    return wsp_ggml_unary(ctx, a, WSP_GGML_UNARY_OP_LEAKY);
+        struct wsp_ggml_tensor  * a, float negative_slope, bool inplace) {
+    bool is_node = false;
+    if (!inplace && (a->grad)) {
+        is_node = true;
+    }
+    struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a);
+    wsp_ggml_set_op_params(result, &negative_slope, sizeof(negative_slope));
+    result->op   = WSP_GGML_OP_LEAKY_RELU;
+    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    return result;
 }
 // wsp_ggml_gelu
@@ -4023,8 +4038,9 @@ static struct wsp_ggml_tensor * wsp_ggml_group_norm_impl(
     struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a);
-    result->op = WSP_GGML_OP_GROUP_NORM;
     result->op_params[0] = n_groups;
+    result->op = WSP_GGML_OP_GROUP_NORM;
     result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = NULL; // TODO: maybe store epsilon here?
@@ -4072,6 +4088,51 @@ struct wsp_ggml_tensor * wsp_ggml_mul_mat(
     return result;
 }
+// wsp_ggml_mul_mat_id
+struct wsp_ggml_tensor * wsp_ggml_mul_mat_id(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * const as[],
+        int                   n_as,
+        struct wsp_ggml_tensor  * ids,
+        int                   id,
+        struct wsp_ggml_tensor  * b) {
+    WSP_GGML_ASSERT(ids->type == WSP_GGML_TYPE_I32);
+    WSP_GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1);
+    WSP_GGML_ASSERT(ids->ne[1] == b->ne[1]);
+    WSP_GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]);
+    WSP_GGML_ASSERT(n_as > 0 && n_as <= WSP_GGML_MAX_SRC - 2);
+    WSP_GGML_ASSERT(id >= 0 && id < ids->ne[0]);
+    bool is_node = false;
+    if (as[0]->grad || b->grad) {
+        is_node = true;
+    }
+    const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] };
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, MAX(as[0]->n_dims, b->n_dims), ne);
+    wsp_ggml_set_op_params_i32(result, 0, id);
+    wsp_ggml_set_op_params_i32(result, 1, n_as);
+    result->op   = WSP_GGML_OP_MUL_MAT_ID;
+    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = ids;
+    result->src[1] = b;
+    for (int i = 0; i < n_as; i++) {
+        struct wsp_ggml_tensor * a = as[i];
+        WSP_GGML_ASSERT(wsp_ggml_are_same_shape(as[0], a));
+        WSP_GGML_ASSERT(wsp_ggml_can_mul_mat(a, b));
+        WSP_GGML_ASSERT(!wsp_ggml_is_transposed(a));
+        result->src[i + 2] = a;
+    }
+    return result;
+}
 // wsp_ggml_out_prod
 struct wsp_ggml_tensor * wsp_ggml_out_prod(
@@ -4225,7 +4286,7 @@ struct wsp_ggml_tensor * wsp_ggml_set_2d_inplace(
         struct wsp_ggml_tensor *  b,
         size_t                nb1,
         size_t                offset) {
-    return wsp_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
+    return wsp_ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, true);
 }
 // wsp_ggml_cpy
@@ -4689,7 +4750,9 @@ struct wsp_ggml_tensor * wsp_ggml_get_rows(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
         struct wsp_ggml_tensor  * b) {
-    WSP_GGML_ASSERT(wsp_ggml_is_matrix(a) && wsp_ggml_is_vector(b) && b->type == WSP_GGML_TYPE_I32);
+    WSP_GGML_ASSERT(a->ne[2] == b->ne[1]);
+    WSP_GGML_ASSERT(b->ne[3] == 1);
+    WSP_GGML_ASSERT(b->type == WSP_GGML_TYPE_I32);
     bool is_node = false;
@@ -4699,7 +4762,7 @@ struct wsp_ggml_tensor * wsp_ggml_get_rows(
     // TODO: implement non F32 return
     //struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, a->ne[0], b->ne[0]);
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, a->ne[0], b->ne[0], b->ne[1], b->ne[2]);
     result->op   = WSP_GGML_OP_GET_ROWS;
     result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
@@ -4842,7 +4905,17 @@ struct wsp_ggml_tensor * wsp_ggml_diag_mask_zero_inplace(
 static struct wsp_ggml_tensor * wsp_ggml_soft_max_impl(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * mask,
+        float                 scale,
         bool                  inplace) {
+    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(a));
+    if (mask) {
+        WSP_GGML_ASSERT(wsp_ggml_is_contiguous(mask));
+        WSP_GGML_ASSERT(mask->ne[2] == 1);
+        WSP_GGML_ASSERT(mask->ne[3] == 1);
+        WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(mask, a));
+    }
     bool is_node = false;
     if (a->grad) {
@@ -4851,9 +4924,13 @@ static struct wsp_ggml_tensor * wsp_ggml_soft_max_impl(
     struct wsp_ggml_tensor * result = inplace ? wsp_ggml_view_tensor(ctx, a) : wsp_ggml_dup_tensor(ctx, a);
+    float params[] = { scale };
+    wsp_ggml_set_op_params(result, params, sizeof(params));
     result->op   = WSP_GGML_OP_SOFT_MAX;
     result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
+    result->src[1] = mask;
     return result;
 }
@@ -4861,13 +4938,21 @@ static struct wsp_ggml_tensor * wsp_ggml_soft_max_impl(
 struct wsp_ggml_tensor * wsp_ggml_soft_max(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a) {
-    return wsp_ggml_soft_max_impl(ctx, a, false);
+    return wsp_ggml_soft_max_impl(ctx, a, NULL, 1.0f, false);
 }
 struct wsp_ggml_tensor * wsp_ggml_soft_max_inplace(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a) {
-    return wsp_ggml_soft_max_impl(ctx, a, true);
+    return wsp_ggml_soft_max_impl(ctx, a, NULL, 1.0f, true);
+}
+struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * mask,
+        float                 scale) {
+    return wsp_ggml_soft_max_impl(ctx, a, mask, scale, false);
 }
 // wsp_ggml_soft_max_back
@@ -5040,8 +5125,13 @@ struct wsp_ggml_tensor * wsp_ggml_rope_back(
         int                   n_dims,
         int                   mode,
         int                   n_ctx,
+        int                   n_orig_ctx,
         float                 freq_base,
         float                 freq_scale,
+        float                 ext_factor,
+        float                 attn_factor,
+        float                 beta_fast,
+        float                 beta_slow,
         float                 xpos_base,
         bool                  xpos_down) {
     WSP_GGML_ASSERT(wsp_ggml_is_vector(b));
@@ -5058,11 +5148,15 @@ struct wsp_ggml_tensor * wsp_ggml_rope_back(
     struct wsp_ggml_tensor * result = wsp_ggml_dup_tensor(ctx, a);
-    int32_t params[8] = { /*n_past*/ 0, n_dims, mode, n_ctx };
-    memcpy(params + 4, &freq_base,  sizeof(float));
-    memcpy(params + 5, &freq_scale, sizeof(float));
-    memcpy(params + 6, &xpos_base,  sizeof(float));
-    memcpy(params + 7, &xpos_down,  sizeof(bool));
+    int32_t params[13] = { /*n_past*/ 0, n_dims, mode, n_ctx, n_orig_ctx };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &xpos_base,    sizeof(float));
+    memcpy(params + 12, &xpos_down,    sizeof(bool));
     wsp_ggml_set_op_params(result, params, sizeof(params));
     result->op   = WSP_GGML_OP_ROPE_BACK;
@@ -5137,82 +5231,6 @@ static int64_t wsp_ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, in
     return (ins + 2 * p - d * (ks - 1) - 1) / s + 1;
 }
-// im2col: [N, IC, IL] => [N, OL, IC*K]
-// a: [OC，IC, K]
-// b: [N, IC, IL]
-// result: [N, OL, IC*K]
-static struct wsp_ggml_tensor * wsp_ggml_conv_1d_stage_0(
-    struct wsp_ggml_context * ctx,
-    struct wsp_ggml_tensor  * a,
-    struct wsp_ggml_tensor  * b,
-    int                   s0,
-    int                   p0,
-    int                   d0) {
-    WSP_GGML_ASSERT(a->ne[1] == b->ne[1]);
-    bool is_node = false;
-    if (a->grad || b->grad) {
-        WSP_GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-    const int64_t OL = wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
-    const int64_t ne[4] = {
-        a->ne[1] * a->ne[0],
-        OL,
-        b->ne[2],
-        1,
-    };
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F16, 4, ne);
-    int32_t params[] = { s0, p0, d0 };
-    wsp_ggml_set_op_params(result, params, sizeof(params));
-    result->op = WSP_GGML_OP_CONV_1D_STAGE_0;
-    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    return result;
-}
-// wsp_ggml_conv_1d_stage_1
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// a: [OC, IC, K]
-// b: [N, OL, IC * K]
-// result: [N, OC, OL]
-static struct wsp_ggml_tensor * wsp_ggml_conv_1d_stage_1(
-    struct wsp_ggml_context * ctx,
-    struct wsp_ggml_tensor  * a,
-    struct wsp_ggml_tensor  * b) {
-    bool is_node = false;
-    if (a->grad || b->grad) {
-        WSP_GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-    const int64_t ne[4] = {
-        b->ne[1],
-        a->ne[2],
-        b->ne[2],
-        1,
-    };
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne);
-    result->op = WSP_GGML_OP_CONV_1D_STAGE_1;
-    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    return result;
-}
-// wsp_ggml_conv_1d
 WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
         struct wsp_ggml_context * ctx,
         struct wsp_ggml_tensor  * a,
@@ -5220,43 +5238,17 @@ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
         int                   s0,
         int                   p0,
         int                   d0) {
-    struct wsp_ggml_tensor * result = wsp_ggml_conv_1d_stage_0(ctx, a, b, s0, p0, d0);
-    result = wsp_ggml_conv_1d_stage_1(ctx, a, result);
-    return result;
-}
-// WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
-//         struct wsp_ggml_context * ctx,
-//         struct wsp_ggml_tensor  * a,
-//         struct wsp_ggml_tensor  * b,
-//         int                   s0,
-//         int                   p0,
-//         int                   d0) {
-//     WSP_GGML_ASSERT(wsp_ggml_is_matrix(b));
-//     WSP_GGML_ASSERT(a->ne[1] == b->ne[1]);
-//     bool is_node = false;
-//     if (a->grad || b->grad) {
-//         WSP_GGML_ASSERT(false); // TODO: implement backward
-//         is_node = true;
-//     }
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false); // [N, OL, IC * K]
-//     const int64_t ne[4] = {
-//         wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
-//         a->ne[2], 1, 1,
-//     };
-//     struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 2, ne);
+    struct wsp_ggml_tensor * result =
+        wsp_ggml_mul_mat(ctx,
+                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K]
+                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2]));                    // [OC，IC, K] => [OC, IC * K]
-//     int32_t params[] = { s0, p0, d0 };
-//     wsp_ggml_set_op_params(result, params, sizeof(params));
+    result = wsp_ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL]
-//     result->op = WSP_GGML_OP_CONV_1D;
-//     result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
-//     result->src[0] = a;
-//     result->src[1] = b;
-//     return result;
-// }
+    return result;
+}
 // wsp_ggml_conv_1d_ph
@@ -5319,7 +5311,7 @@ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
 // a: [OC，IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OH, OW, IC*KH*KW]
-static struct wsp_ggml_tensor * wsp_ggml_conv_2d_stage_0(
+struct wsp_ggml_tensor * wsp_ggml_im2col(
     struct wsp_ggml_context * ctx,
     struct wsp_ggml_tensor  * a,
     struct wsp_ggml_tensor  * b,
@@ -5328,9 +5320,14 @@ static struct wsp_ggml_tensor * wsp_ggml_conv_2d_stage_0(
     int                  p0,
     int                  p1,
     int                  d0,
-    int                  d1) {
+    int                  d1,
+    bool                 is_2D) {
-    WSP_GGML_ASSERT(a->ne[2] == b->ne[2]);
+    if(is_2D) {
+        WSP_GGML_ASSERT(a->ne[2] == b->ne[2]);
+    } else {
+        WSP_GGML_ASSERT(a->ne[1] == b->ne[1]);
+    }
     bool is_node = false;
     if (a->grad || b->grad) {
@@ -5338,81 +5335,51 @@ static struct wsp_ggml_tensor * wsp_ggml_conv_2d_stage_0(
         is_node = true;
     }
-    const int64_t OH = wsp_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
-    const int64_t OW = wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
+    const int64_t OH = is_2D ? wsp_ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1) : 0;
+    const int64_t OW =         wsp_ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
     const int64_t ne[4] = {
-        a->ne[2] * a->ne[1] * a->ne[0],
+        is_2D ? (a->ne[2] * a->ne[1] * a->ne[0]) : a->ne[1] * a->ne[0],
         OW,
-        OH,
-        b->ne[3],
+        is_2D ? OH : b->ne[2],
+        is_2D ?      b->ne[3] : 1,
     };
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F16, 4, ne);
-    int32_t params[] = { s0, s1, p0, p1, d0, d1 };
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F16, 4, ne);
+    int32_t params[] = { s0, s1, p0, p1, d0, d1, (is_2D ? 1 : 0) };
     wsp_ggml_set_op_params(result, params, sizeof(params));
-    result->op = WSP_GGML_OP_CONV_2D_STAGE_0;
-    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
-    result->src[0] = a;
-    result->src[1] = b;
-    return result;
-}
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// a: [OC, IC, KH, KW]
-// b: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static struct wsp_ggml_tensor * wsp_ggml_conv_2d_stage_1(
-    struct wsp_ggml_context * ctx,
-    struct wsp_ggml_tensor  * a,
-    struct wsp_ggml_tensor  * b) {
-    bool is_node = false;
-    if (a->grad || b->grad) {
-        WSP_GGML_ASSERT(false); // TODO: implement backward
-        is_node = true;
-    }
-    const int64_t ne[4] = {
-        b->ne[1],
-        b->ne[2],
-        a->ne[3],
-        b->ne[3],
-    };
-    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_F32, 4, ne);
-    result->op = WSP_GGML_OP_CONV_2D_STAGE_1;
+    result->op = WSP_GGML_OP_IM2COL;
     result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
     result->src[0] = a;
     result->src[1] = b;
     return result;
 }
 // a: [OC，IC, KH, KW]
 // b: [N, IC, IH, IW]
 // result: [N, OC, OH, OW]
 struct wsp_ggml_tensor * wsp_ggml_conv_2d(
-    struct wsp_ggml_context * ctx,
-    struct wsp_ggml_tensor  * a,
-    struct wsp_ggml_tensor  * b,
-    int                  s0,
-    int                  s1,
-    int                  p0,
-    int                  p1,
-    int                  d0,
-    int                  d1) {
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        struct wsp_ggml_tensor  * b,
+        int                  s0,
+        int                  s1,
+        int                  p0,
+        int                  p1,
+        int                  d0,
+        int                  d1) {
+    struct wsp_ggml_tensor * im2col = wsp_ggml_im2col(ctx, a, b, s0, s1, p0, p1, d0, d1, true); // [N, OH, OW, IC * KH * KW]
-    struct wsp_ggml_tensor * result = wsp_ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
-    result = wsp_ggml_conv_2d_stage_1(ctx, a, result);
+    struct wsp_ggml_tensor * result =
+        wsp_ggml_mul_mat(ctx,
+                wsp_ggml_reshape_2d(ctx, im2col, im2col->ne[0],  im2col->ne[3] * im2col->ne[2] * im2col->ne[1]), // [N, OH, OW, IC * KH * KW] => [N*OH*OW, IC * KH * KW]
+                wsp_ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1] * a->ne[2]),  a->ne[3]));                       // [OC，IC, KH, KW] => [OC, IC * KH * KW]
-    return result;
+    result = wsp_ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], a->ne[3], im2col->ne[3]); // [N, OC, OH, OW]
+    return result;
 }
 // wsp_ggml_conv_2d_sk_p0
@@ -5573,6 +5540,30 @@ static struct wsp_ggml_tensor * wsp_ggml_upscale_impl(
     return result;
 }
+struct wsp_ggml_tensor * wsp_ggml_pad(
+    struct wsp_ggml_context * ctx,
+    struct wsp_ggml_tensor  * a,
+    int p0, int p1, int p2, int p3) {
+    bool is_node = false;
+    if (a->grad) {
+        WSP_GGML_ASSERT(false); // TODO: implement backward
+        is_node = true;
+    }
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0,
+            a->ne[1] + p1,
+            a->ne[2] + p2,
+            a->ne[3] + p3);
+    result->op = WSP_GGML_OP_PAD;
+    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    return result;
+}
 struct wsp_ggml_tensor * wsp_ggml_upscale(
     struct wsp_ggml_context * ctx,
     struct wsp_ggml_tensor * a,
@@ -5580,6 +5571,43 @@ struct wsp_ggml_tensor * wsp_ggml_upscale(
     return wsp_ggml_upscale_impl(ctx, a, scale_factor);
 }
+// wsp_ggml_argsort
+struct wsp_ggml_tensor * wsp_ggml_argsort(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        enum wsp_ggml_sort_order  order) {
+    bool is_node = false;
+    struct wsp_ggml_tensor * result = wsp_ggml_new_tensor(ctx, WSP_GGML_TYPE_I32, a->n_dims, a->ne);
+    wsp_ggml_set_op_params_i32(result, 0, (int32_t) order);
+    result->op   = WSP_GGML_OP_ARGSORT;
+    result->grad = is_node ? wsp_ggml_dup_tensor(ctx, result) : NULL;
+    result->src[0] = a;
+    return result;
+}
+// wsp_ggml_top_k
+struct wsp_ggml_tensor * wsp_ggml_top_k(
+        struct wsp_ggml_context * ctx,
+        struct wsp_ggml_tensor  * a,
+        int                   k) {
+    WSP_GGML_ASSERT(a->ne[0] >= k);
+    struct wsp_ggml_tensor * result = wsp_ggml_argsort(ctx, a, WSP_GGML_SORT_DESC);
+    result = wsp_ggml_view_4d(ctx, result,
+                k, result->ne[1], result->ne[2], result->ne[3],
+                   result->nb[1], result->nb[2], result->nb[3],
+                0);
+    return result;
+}
 // wsp_ggml_flash_attn
 struct wsp_ggml_tensor * wsp_ggml_flash_attn(
@@ -6472,7 +6500,7 @@ static void wsp_ggml_compute_forward_dup_f16(
                     }
                 }
             } else if (type_traits[dst->type].from_float) {
-                wsp_ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+                wsp_ggml_from_float_t const wsp_quantize_row_q = type_traits[dst->type].from_float;
                 float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
                 size_t id = 0;
@@ -6489,7 +6517,7 @@ static void wsp_ggml_compute_forward_dup_f16(
                                 src0_f32[i00] = WSP_GGML_FP16_TO_FP32(src0_ptr[i00]);
                             }
-                            quantize_row_q(src0_f32, dst_ptr + id, ne00);
+                            wsp_quantize_row_q(src0_f32, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -6725,7 +6753,7 @@ static void wsp_ggml_compute_forward_dup_f32(
                     }
                 }
             } else if (type_traits[dst->type].from_float) {
-                wsp_ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float;
+                wsp_ggml_from_float_t const wsp_quantize_row_q = type_traits[dst->type].from_float;
                 size_t id = 0;
                 size_t rs = nb0 * (ne00 / wsp_ggml_blck_size(dst->type));
@@ -6736,7 +6764,7 @@ static void wsp_ggml_compute_forward_dup_f32(
                         id += rs * ir0;
                         for (int i01 = ir0; i01 < ir1; i01++) {
                             const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);
-                            quantize_row_q(src0_ptr, dst_ptr + id, ne00);
+                            wsp_quantize_row_q(src0_ptr, dst_ptr + id, ne00);
                             id += rs;
                         }
                         id += rs * (ne01 - ir1);
@@ -6939,7 +6967,7 @@ static void wsp_ggml_compute_forward_add_f32(
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
         struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
@@ -6972,16 +7000,19 @@ static void wsp_ggml_compute_forward_add_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+            for (int64_t r = 0; r < nr0; ++r) {
 #ifdef WSP_GGML_USE_ACCELERATE
-            vDSP_vadd(src0_ptr, 1, src1_ptr, 1, dst_ptr, 1, ne00);
+                vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            wsp_ggml_vec_add_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                wsp_ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
+            }
         }
     } else {
         // src1 is not contiguous
@@ -6998,8 +7029,9 @@ static void wsp_ggml_compute_forward_add_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
                 dst_ptr[i0] = src0_ptr[i0] + *src1_ptr;
             }
@@ -7158,8 +7190,8 @@ static void wsp_ggml_compute_forward_add_q_f32(
     const enum wsp_ggml_type type = src0->type;
     const enum wsp_ggml_type dtype = dst->type;
-    wsp_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    wsp_ggml_from_float_t const quantize_row_q = type_traits[dtype].from_float;
+    wsp_ggml_to_float_t const wsp_dewsp_quantize_row_q = type_traits[type].to_float;
+    wsp_ggml_from_float_t const wsp_quantize_row_q = type_traits[dtype].from_float;
     // we don't support permuted src0 or src1
     WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(type));
@@ -7204,12 +7236,12 @@ static void wsp_ggml_compute_forward_add_q_f32(
         assert(ne00 % 32 == 0);
         // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne00);
+        wsp_dewsp_quantize_row_q(src0_row, wdata, ne00);
         // add src1
         wsp_ggml_vec_acc_f32(ne00, wdata, src1_row);
         // quantize row to dst
-        if (quantize_row_q != NULL) {
-            quantize_row_q(wdata, dst_row, ne00);
+        if (wsp_quantize_row_q != NULL) {
+            wsp_quantize_row_q(wdata, dst_row, ne00);
         } else {
             memcpy(dst_row, wdata, ne0*nb0);
         }
@@ -7435,8 +7467,8 @@ static void wsp_ggml_compute_forward_add1_q_f32(
     WSP_GGML_TENSOR_UNARY_OP_LOCALS
     const enum wsp_ggml_type type = src0->type;
-    wsp_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    wsp_ggml_from_float_t const quantize_row_q = type_traits[type].from_float;
+    wsp_ggml_to_float_t const wsp_dewsp_quantize_row_q = type_traits[type].to_float;
+    wsp_ggml_from_float_t const wsp_quantize_row_q = type_traits[type].from_float;
     // we don't support permuted src0
     WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(type));
@@ -7471,11 +7503,11 @@ static void wsp_ggml_compute_forward_add1_q_f32(
         assert(ne0 % 32 == 0);
         // unquantize row from src0 to temp buffer
-        dequantize_row_q(src0_row, wdata, ne0);
+        wsp_dewsp_quantize_row_q(src0_row, wdata, ne0);
         // add src1
         wsp_ggml_vec_acc1_f32(ne0, wdata, v);
         // quantize row to dst
-        quantize_row_q(wdata, dst_row, ne0);
+        wsp_quantize_row_q(wdata, dst_row, ne0);
     }
 }
@@ -7533,7 +7565,7 @@ static void wsp_ggml_compute_forward_acc_f32(
     WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0));
     // view src0 and dst with these strides and data offset inbytes during acc
-    // nb0 is implicitely element_size because src0 and dst are contiguous
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1     = ((int32_t *) dst->op_params)[0];
     size_t nb2     = ((int32_t *) dst->op_params)[1];
     size_t nb3     = ((int32_t *) dst->op_params)[2];
@@ -7719,7 +7751,7 @@ static void wsp_ggml_compute_forward_mul_f32(
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
         struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(wsp_ggml_can_repeat_rows(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
@@ -7727,8 +7759,10 @@ static void wsp_ggml_compute_forward_mul_f32(
     const int ith = params->ith;
     const int nth = params->nth;
+// TODO: OpenCL kernel support broadcast
 #ifdef WSP_GGML_USE_CLBLAST
     if (src1->backend == WSP_GGML_BACKEND_GPU) {
+        WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, src1));
         if (ith == 0) {
             wsp_ggml_cl_mul(src0, src1, dst);
         }
@@ -7742,7 +7776,6 @@ static void wsp_ggml_compute_forward_mul_f32(
     WSP_GGML_ASSERT( nb0 == sizeof(float));
     WSP_GGML_ASSERT(nb00 == sizeof(float));
-    WSP_GGML_ASSERT(ne00 == ne10);
     if (nb10 == sizeof(float)) {
         for (int64_t ir = ith; ir < nr; ir += nth) {
@@ -7754,20 +7787,21 @@ static void wsp_ggml_compute_forward_mul_f32(
             const int64_t i13 = i03 % ne13;
             const int64_t i12 = i02 % ne12;
             const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
             float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+            for (int64_t r = 0 ; r < nr0; ++r) {
 #ifdef WSP_GGML_USE_ACCELERATE
-            UNUSED(wsp_ggml_vec_mul_f32);
+                UNUSED(wsp_ggml_vec_mul_f32);
-            vDSP_vmul( src0_ptr, 1, src1_ptr, 1, dst_ptr,  1, ne00);
+                vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            wsp_ggml_vec_mul_f32(ne00, dst_ptr, src0_ptr, src1_ptr);
+                wsp_ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
@@ -7785,8 +7819,9 @@ static void wsp_ggml_compute_forward_mul_f32(
             float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
             float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
-            for (int64_t i0 = 0; i0 < ne00; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i0*nb10);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
                 dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr);
             }
@@ -7820,14 +7855,16 @@ static void wsp_ggml_compute_forward_div_f32(
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
         struct wsp_ggml_tensor * dst) {
-    assert(params->ith == 0);
-    assert(wsp_ggml_are_same_shape(src0, src1) && wsp_ggml_are_same_shape(src0, dst));
+    WSP_GGML_ASSERT(wsp_ggml_can_repeat(src1, src0) && wsp_ggml_are_same_shape(src0, dst));
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
     }
-    const int nr  = wsp_ggml_nrows(src0);
+    const int ith = params->ith;
+    const int nth = params->nth;
+    const int64_t nr = wsp_ggml_nrows(src0);
     WSP_GGML_TENSOR_BINARY_OP_LOCALS
@@ -7835,41 +7872,50 @@ static void wsp_ggml_compute_forward_div_f32(
     WSP_GGML_ASSERT(nb00 == sizeof(float));
     if (nb10 == sizeof(float)) {
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-#ifdef WSP_GGML_USE_ACCELERATE
-            UNUSED(wsp_ggml_vec_div_f32);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            const int64_t nr0 = ne00 / ne10;
-            vDSP_vdiv(
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01), 1,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ), 1,
-                    ne0);
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
+            for (int64_t r = 0; r < nr0; ++r) {
+#ifdef WSP_GGML_USE_ACCELERATE
+                UNUSED(wsp_ggml_vec_div_f32);
+                vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
 #else
-            wsp_ggml_vec_div_f32(ne0,
-                    (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 ),
-                    (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01),
-                    (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11));
+                wsp_ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
 #endif
-                // }
-            // }
+            }
         }
     } else {
         // src1 is not contiguous
-        for (int ir = 0; ir < nr; ++ir) {
-            // src0, src1 and dst are same shape => same indices
-            const int i3 = ir/(ne2*ne1);
-            const int i2 = (ir - i3*ne2*ne1)/ne1;
-            const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
+        for (int64_t ir = ith; ir < nr; ir += nth) {
+            // src0 and dst are same shape => same indices
+            // src1 is broadcastable across src0 and dst in i1, i2, i3
+            const int64_t i03 = ir/(ne02*ne01);
+            const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
+            const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
-            float * dst_ptr  = (float *) ((char *) dst->data  + i3*nb3  + i2*nb2  + i1*nb1 );
-            float * src0_ptr = (float *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11 + i0*nb10);
+            const int64_t i13 = i03 % ne13;
+            const int64_t i12 = i02 % ne12;
+            const int64_t i11 = i01 % ne11;
+            float * dst_ptr  = (float *) ((char *) dst->data  + i03*nb3  + i02*nb2  + i01*nb1 );
+            float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
+            for (int64_t i0 = 0; i0 < ne00; ++i0) {
+                const int64_t i10 = i0 % ne10;
+                float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10);
                 dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr);
             }
@@ -8315,7 +8361,7 @@ static void wsp_ggml_compute_forward_repeat_f16(
         return;
     }
-    WSP_GGML_TENSOR_UNARY_OP_LOCALS;
+    WSP_GGML_TENSOR_UNARY_OP_LOCALS
     // guaranteed to be an integer due to the check in wsp_ggml_can_repeat
     const int nr0 = (int)(ne0/ne00);
@@ -8460,6 +8506,7 @@ static void wsp_ggml_compute_forward_concat_f32(
     WSP_GGML_ASSERT(src0->nb[0] == sizeof(float));
     const int ith = params->ith;
+    const int nth = params->nth;
     WSP_GGML_TENSOR_BINARY_OP_LOCALS
@@ -8469,7 +8516,7 @@ static void wsp_ggml_compute_forward_concat_f32(
     WSP_GGML_ASSERT(nb10 == sizeof(float));
     for (int i3 = 0; i3 < ne3; i3++) {
-        for (int i2 = ith; i2 < ne2; i2++) {
+        for (int i2 = ith; i2 < ne2; i2 += nth) {
             if (i2 < ne02) { // src0
                 for (int i1 = 0; i1 < ne1; i1++) {
                     for (int i0 = 0; i0 < ne0; i0++) {
@@ -8981,10 +9028,9 @@ static void wsp_ggml_compute_forward_silu(
             } break;
     }
 }
+// wsp_ggml_compute_forward_leaky_relu
-// wsp_ggml_compute_forward_leaky
-static void wsp_ggml_compute_forward_leaky_f32(
+static void wsp_ggml_compute_forward_leaky_relu_f32(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         struct wsp_ggml_tensor * dst) {
@@ -8998,24 +9044,27 @@ static void wsp_ggml_compute_forward_leaky_f32(
     const int n  = wsp_ggml_nrows(src0);
     const int nc = src0->ne[0];
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
     assert(dst->nb[0]  == sizeof(float));
     assert(src0->nb[0] == sizeof(float));
     for (int i = 0; i < n; i++) {
-        wsp_ggml_vec_leaky_f32(nc,
+        wsp_ggml_vec_leaky_relu_f32(nc,
                 (float *) ((char *) dst->data  + i*( dst->nb[1])),
-                (float *) ((char *) src0->data + i*(src0->nb[1])));
+                (float *) ((char *) src0->data + i*(src0->nb[1])), negative_slope);
     }
 }
-static void wsp_ggml_compute_forward_leaky(
+static void wsp_ggml_compute_forward_leaky_relu(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         struct wsp_ggml_tensor * dst) {
     switch (src0->type) {
         case WSP_GGML_TYPE_F32:
             {
-                wsp_ggml_compute_forward_leaky_f32(params, src0, dst);
+                wsp_ggml_compute_forward_leaky_relu_f32(params, src0, dst);
             } break;
         default:
             {
@@ -9504,9 +9553,14 @@ static bool wsp_ggml_compute_forward_mul_mat_use_blas(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
+    // NOTE: with WSP_GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
+    //       all the experts for each batch element and the processing would become incredibly slow
     // TODO: find the optimal values for these
-    if (wsp_ggml_is_contiguous(src0) &&
+    if (dst->op != WSP_GGML_OP_MUL_MAT_ID &&
+        wsp_ggml_is_contiguous(src0) &&
         wsp_ggml_is_contiguous(src1) &&
+      //src0->type == WSP_GGML_TYPE_F32 &&
+        src1->type == WSP_GGML_TYPE_F32 &&
         (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
         /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
@@ -9517,11 +9571,16 @@ static bool wsp_ggml_compute_forward_mul_mat_use_blas(
 }
 #endif
+// off1 = offset in i11 and i1
+// cne1 = ne11 and ne1
+// in a normal matrix multiplication, off1 = 0 and cne1 = ne1
+// during WSP_GGML_TASK_INIT, the full src1 is converted regardless of off1 and cne1
 static void wsp_ggml_compute_forward_mul_mat(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
+              struct wsp_ggml_tensor * dst,
+              int64_t off1, int64_t cne1) {
     int64_t t0 = wsp_ggml_perf_time_us();
     UNUSED(t0);
@@ -9545,7 +9604,7 @@ static void wsp_ggml_compute_forward_mul_mat(
     // we don't support permuted src0 or src1
     WSP_GGML_ASSERT(nb00 == wsp_ggml_type_size(type));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
+    WSP_GGML_ASSERT(nb10 == wsp_ggml_type_size(src1->type));
     // dst cannot be transposed or permuted
     WSP_GGML_ASSERT(nb0 == sizeof(float));
@@ -9589,10 +9648,9 @@ static void wsp_ggml_compute_forward_mul_mat(
                 const int64_t i03 = i13/r3;
                 const int64_t i02 = i12/r2;
-                const void  * x = (char *)            src0->data + i02*nb02 + i03*nb03;
-                const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
-                float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+                const void  * x = (char *)            src0->data +             i02*nb02 + i03*nb03;
+                const float * y = (float *) ((char *) src1->data + off1*nb11 + i12*nb12 + i13*nb13);
+                      float * d = (float *) ((char *)  dst->data + off1*nb1  + i12*nb2  + i13*nb3);
                 if (type != WSP_GGML_TYPE_F32) {
                             float * const wdata    = params->wdata;
@@ -9609,10 +9667,10 @@ static void wsp_ggml_compute_forward_mul_mat(
                 }
                 cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                        ne11, ne01, ne10,
-                        1.0f,    y, ne10,
-                                 x, ne00,
-                        0.0f,    d, ne01);
+                         cne1, ne01, ne10,
+                         1.0f,    y, ne10,
+                                  x, ne00,
+                         0.0f,    d, ne01);
             }
         }
@@ -9627,6 +9685,9 @@ static void wsp_ggml_compute_forward_mul_mat(
             char * wdata = params->wdata;
             const size_t row_size = ne10*wsp_ggml_type_size(vec_dot_type)/wsp_ggml_blck_size(vec_dot_type);
+            assert(params->wsize >= ne11*ne12*ne13*row_size);
+            assert(src1->type == WSP_GGML_TYPE_F32);
             for (int64_t i13 = 0; i13 < ne13; ++i13) {
                 for (int64_t i12 = 0; i12 < ne12; ++i12) {
                     for (int64_t i11 = 0; i11 < ne11; ++i11) {
@@ -9648,7 +9709,7 @@ static void wsp_ggml_compute_forward_mul_mat(
     const size_t row_size = ne10*wsp_ggml_type_size(vec_dot_type)/wsp_ggml_blck_size(vec_dot_type);
     const int64_t nr0 = ne01;           // src0 rows
-    const int64_t nr1 = ne11*ne12*ne13; // src1 rows
+    const int64_t nr1 = cne1*ne12*ne13; // src1 rows
     //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1);
@@ -9690,9 +9751,9 @@ static void wsp_ggml_compute_forward_mul_mat(
     for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
         for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
             for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                const int64_t i13 = (ir1/(ne12*ne11));
-                const int64_t i12 = (ir1 - i13*ne12*ne11)/ne11;
-                const int64_t i11 = (ir1 - i13*ne12*ne11 - i12*ne11);
+                const int64_t i13 = (ir1/(ne12*cne1));
+                const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1;
+                const int64_t i11 = (ir1 - i13*ne12*cne1 - i12*cne1) + off1;
                 // broadcast src0 into src1
                 const int64_t i03 = i13/r3;
@@ -9728,6 +9789,34 @@ static void wsp_ggml_compute_forward_mul_mat(
     }
 }
+// wsp_ggml_compute_forward_mul_mat_id
+static void wsp_ggml_compute_forward_mul_mat_id(
+        const struct wsp_ggml_compute_params * params,
+        const struct wsp_ggml_tensor * src0,
+        const struct wsp_ggml_tensor * src1,
+              struct wsp_ggml_tensor * dst) {
+    if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
+        // during WSP_GGML_TASK_INIT the entire src1 is converted to vec_dot_type
+        wsp_ggml_compute_forward_mul_mat(params, dst->src[2], src1, dst, 0, dst->ne[1]);
+        return;
+    }
+    const struct wsp_ggml_tensor * ids = src0;
+    const int id   = wsp_ggml_get_op_params_i32(dst, 0);
+    const int n_as = wsp_ggml_get_op_params_i32(dst, 1);
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]);
+        WSP_GGML_ASSERT(row_id >= 0 && row_id < n_as);
+        const struct wsp_ggml_tensor * src0_row = dst->src[row_id + 2];
+        wsp_ggml_compute_forward_mul_mat(params, src0_row, src1, dst, i01, 1);
+    }
+}
 // wsp_ggml_compute_forward_out_prod
 static void wsp_ggml_compute_forward_out_prod_f32(
@@ -9743,10 +9832,12 @@ static void wsp_ggml_compute_forward_out_prod_f32(
     const int ith = params->ith;
     const int nth = params->nth;
+    WSP_GGML_ASSERT(ne0  == ne00);
+    WSP_GGML_ASSERT(ne1  == ne10);
+    WSP_GGML_ASSERT(ne2  == ne02);
     WSP_GGML_ASSERT(ne02 == ne12);
-    WSP_GGML_ASSERT(ne03 == ne13);
-    WSP_GGML_ASSERT(ne2  == ne12);
     WSP_GGML_ASSERT(ne3  == ne13);
+    WSP_GGML_ASSERT(ne03 == ne13);
     // we don't support permuted src0 or src1
     WSP_GGML_ASSERT(nb00 == sizeof(float));
@@ -9757,18 +9848,25 @@ static void wsp_ggml_compute_forward_out_prod_f32(
     // WSP_GGML_ASSERT(nb1 <= nb2);
     // WSP_GGML_ASSERT(nb2 <= nb3);
-    WSP_GGML_ASSERT(ne0 == ne00);
-    WSP_GGML_ASSERT(ne1 == ne10);
-    WSP_GGML_ASSERT(ne2 == ne02);
-    WSP_GGML_ASSERT(ne3 == ne03);
     // nb01 >= nb00 - src0 is not transposed
     //   compute by src0 rows
     // TODO: #if defined(WSP_GGML_USE_CUBLAS) wsp_ggml_cuda_out_prod
-    // TODO: #if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) || defined(WSP_GGML_USE_CLBLAST)
+    // TODO: #if defined(WSP_GGML_USE_CLBLAST)
+#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS)
+    bool use_blas = wsp_ggml_is_matrix(src0) &&
+        wsp_ggml_is_matrix(src1) &&
+        wsp_ggml_is_contiguous(src0) &&
+        (wsp_ggml_is_contiguous(src1) || wsp_ggml_is_transposed(src1));
+#endif
     if (params->type == WSP_GGML_TASK_INIT) {
+#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS) // gemm beta will zero dst
+        if (use_blas) {
+            return;
+        }
+#endif
         wsp_ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
         return;
     }
@@ -9777,6 +9875,50 @@ static void wsp_ggml_compute_forward_out_prod_f32(
         return;
     }
+#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS)
+    if (use_blas) {
+        if (params->ith != 0) { // All threads other than the first do no work.
+            return;
+        }
+        // Arguments to wsp_ggml_compute_forward_out_prod (expressed as major,minor)
+        // src0: (k,n)
+        // src1: (k,m)
+        // dst:  (m,n)
+        //
+        // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+        // Also expressed as (major,minor)
+        // a: (m,k): so src1 transposed
+        // b: (k,n): so src0
+        // c: (m,n)
+        //
+        // However, if wsp_ggml_is_transposed(src1) is true, then
+        // src1->data already contains a transposed version, so sgemm mustn't
+        // transpose it further.
+        int n = src0->ne[0];
+        int k = src0->ne[1];
+        int m = src1->ne[0];
+        int transposeA, lda;
+        if (!wsp_ggml_is_transposed(src1)) {
+            transposeA = CblasTrans;
+            lda = m;
+        } else {
+            transposeA = CblasNoTrans;
+            lda = k;
+        }
+        float * a = (float *) ((char *) src1->data);
+        float * b = (float *) ((char *) src0->data);
+        float * c = (float *) ((char *) dst->data);
+        cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+        return;
+    }
+#endif
     // dst[:,:,:,:] = 0
     // for i2,i3:
     //   for i1:
@@ -9880,7 +10022,7 @@ static void wsp_ggml_compute_forward_out_prod_q_f32(
     const int nth = params->nth;
     const enum wsp_ggml_type type = src0->type;
-    wsp_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
+    wsp_ggml_to_float_t const wsp_dewsp_quantize_row_q = type_traits[type].to_float;
     WSP_GGML_ASSERT(ne02 == ne12);
     WSP_GGML_ASSERT(ne03 == ne13);
@@ -9957,7 +10099,7 @@ static void wsp_ggml_compute_forward_out_prod_q_f32(
             float * s1 = (float *) ((char *) src1->data + (i1*nb10 + i11*nb11 + i12*nb12 + i13*nb13));
             float * d  = (float *) ((char *)  dst->data + (          i1*nb1 + i2*nb2 + i3*nb3));
-            dequantize_row_q(s0, wdata, ne0);
+            wsp_dewsp_quantize_row_q(s0, wdata, ne0);
             wsp_ggml_vec_mad_f32(ne0, d, wdata, *s1);
         }
     }
@@ -10084,7 +10226,7 @@ static void wsp_ggml_compute_forward_set_f32(
     WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst) && wsp_ggml_is_contiguous(src0));
     // view src0 and dst with these strides and data offset inbytes during set
-    // nb0 is implicitely element_size because src0 and dst are contiguous
+    // nb0 is implicitly element_size because src0 and dst are contiguous
     size_t nb1     = ((int32_t *) dst->op_params)[0];
     size_t nb2     = ((int32_t *) dst->op_params)[1];
     size_t nb3     = ((int32_t *) dst->op_params)[2];
@@ -10248,21 +10390,30 @@ static void wsp_ggml_compute_forward_get_rows_q(
         return;
     }
-    const int nc = src0->ne[0];
-    const int nr = wsp_ggml_nelements(src1);
-    const enum wsp_ggml_type type = src0->type;
-    wsp_ggml_to_float_t const dequantize_row_q = type_traits[type].to_float;
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == wsp_ggml_type_size(type));
+    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    const int64_t nc = ne00;
+    const int64_t nr = wsp_ggml_nelements(src1); WSP_GGML_UNUSED(nr);
-        dequantize_row_q(
-                (const void *) ((char *) src0->data + r*src0->nb[1]),
-                     (float *) ((char *)  dst->data + i*dst->nb[1]), nc);
+    const enum wsp_ggml_type type = src0->type;
+    wsp_ggml_to_float_t const wsp_dewsp_quantize_row_q = type_traits[type].to_float;
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == wsp_ggml_type_size(type));
+    assert(wsp_ggml_nrows(dst) == nr);
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+                wsp_dewsp_quantize_row_q(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
+        }
     }
 }
@@ -10277,19 +10428,26 @@ static void wsp_ggml_compute_forward_get_rows_f16(
         return;
     }
-    const int nc = src0->ne[0];
-    const int nr = wsp_ggml_nelements(src1);
+    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(wsp_ggml_fp16_t));
+    const int64_t nc = ne00;
+    const int64_t nr = wsp_ggml_nelements(src1); WSP_GGML_UNUSED(nr);
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(wsp_ggml_fp16_t));
+    assert(wsp_ggml_nrows(dst) == nr);
-        for (int j = 0; j < nc; ++j) {
-            wsp_ggml_fp16_t v = ((wsp_ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];
-            ((float *) ((char *)  dst->data + i*dst->nb[1]))[j] = WSP_GGML_FP16_TO_FP32(v);
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+                wsp_ggml_fp16_to_fp32_row(
+                        (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                             (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3), nc);
+            }
         }
     }
 }
@@ -10305,19 +10463,27 @@ static void wsp_ggml_compute_forward_get_rows_f32(
         return;
     }
-    const int nc = src0->ne[0];
-    const int nr = wsp_ggml_nelements(src1);
+    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    assert( dst->ne[0] == nc);
-    assert( dst->ne[1] == nr);
-    assert(src0->nb[0] == sizeof(float));
+    const int64_t nc = ne00;
+    const int64_t nr = wsp_ggml_nelements(src1); WSP_GGML_UNUSED(nr);
-    for (int i = 0; i < nr; ++i) {
-        const int r = ((int32_t *) src1->data)[i];
+    assert(ne0  == nc);
+    assert(ne02 == ne11);
+    assert(nb00 == sizeof(float));
+    assert(wsp_ggml_nrows(dst) == nr);
-        wsp_ggml_vec_cpy_f32(nc,
-                (float *) ((char *)  dst->data + i*dst->nb[1]),
-                (float *) ((char *) src0->data + r*src0->nb[1]));
+    // TODO: multi-thread
+    for (int64_t i12 = 0; i12 < ne12; ++i12) {
+        for (int64_t i11 = 0; i11 < ne11; ++i11) {
+            for (int64_t i10 = 0; i10 < ne10; ++i10) {
+                const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+                wsp_ggml_vec_cpy_f32(nc,
+                        (float *) ((char *)  dst->data + i10*nb1  + i11*nb2  + i12*nb3),
+                        (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+            }
+        }
     }
 }
@@ -10630,20 +10796,25 @@ static void wsp_ggml_compute_forward_diag_mask_zero(
 static void wsp_ggml_compute_forward_soft_max_f32(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
-        struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(src0));
-    WSP_GGML_ASSERT(wsp_ggml_is_contiguous(dst));
-    WSP_GGML_ASSERT(wsp_ggml_are_same_shape(src0, dst));
+        const struct wsp_ggml_tensor * src1,
+              struct wsp_ggml_tensor * dst) {
+    assert(wsp_ggml_is_contiguous(dst));
+    assert(wsp_ggml_are_same_shape(src0, dst));
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
     }
+    float scale = 1.0f;
+    memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
     // TODO: handle transposed/permuted matrices
     const int ith = params->ith;
     const int nth = params->nth;
+    const int64_t ne11 = src1 ? src1->ne[1] : 1;
     const int nc = src0->ne[0];
     const int nr = wsp_ggml_nrows(src0);
@@ -10654,29 +10825,40 @@ static void wsp_ggml_compute_forward_soft_max_f32(
     const int ir0 = dr*ith;
     const int ir1 = MIN(ir0 + dr, nr);
+    float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
     for (int i1 = ir0; i1 < ir1; i1++) {
-        float *sp = (float *)((char *) src0->data + i1*src0->nb[1]);
-        float *dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
+        float * dp = (float *)((char *)  dst->data +  i1*dst->nb[1]);
+        // broadcast the mask across rows
+        float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL;
+        wsp_ggml_vec_cpy_f32  (nc, wp, sp);
+        wsp_ggml_vec_scale_f32(nc, wp, scale);
+        if (mp) {
+            wsp_ggml_vec_acc_f32(nc, wp, mp);
+        }
 #ifndef NDEBUG
         for (int i = 0; i < nc; ++i) {
             //printf("p[%d] = %f\n", i, p[i]);
-            assert(!isnan(sp[i]));
+            assert(!isnan(wp[i]));
         }
 #endif
         float max = -INFINITY;
-        wsp_ggml_vec_max_f32(nc, &max, sp);
+        wsp_ggml_vec_max_f32(nc, &max, wp);
         wsp_ggml_float sum = 0.0;
         uint16_t scvt;
         for (int i = 0; i < nc; i++) {
-            if (sp[i] == -INFINITY) {
+            if (wp[i] == -INFINITY) {
                 dp[i] = 0.0f;
             } else {
-                // const float val = (sp[i] == -INFINITY) ? 0.0 : exp(sp[i] - max);
-                wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(sp[i] - max);
+                // const float val = (wp[i] == -INFINITY) ? 0.0 : exp(wp[i] - max);
+                wsp_ggml_fp16_t s = WSP_GGML_FP32_TO_FP16(wp[i] - max);
                 memcpy(&scvt, &s, sizeof(scvt));
                 const float val = WSP_GGML_FP16_TO_FP32(wsp_ggml_table_exp_f16[scvt]);
                 sum += (wsp_ggml_float)val;
@@ -10701,11 +10883,12 @@ static void wsp_ggml_compute_forward_soft_max_f32(
 static void wsp_ggml_compute_forward_soft_max(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
-        struct wsp_ggml_tensor * dst) {
+        const struct wsp_ggml_tensor * src1,
+              struct wsp_ggml_tensor * dst) {
     switch (src0->type) {
         case WSP_GGML_TYPE_F32:
             {
-                wsp_ggml_compute_forward_soft_max_f32(params, src0, dst);
+                wsp_ggml_compute_forward_soft_max_f32(params, src0, src1, dst);
             } break;
         default:
             {
@@ -11086,7 +11269,8 @@ static void wsp_ggml_compute_forward_rope_f32(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
-        struct wsp_ggml_tensor * dst) {
+        struct wsp_ggml_tensor * dst,
+        const bool forward) {
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
     }
@@ -11145,6 +11329,11 @@ static void wsp_ggml_compute_forward_rope_f32(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
     const int32_t * pos = (const int32_t *) src1->data;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11161,9 +11350,9 @@ static void wsp_ggml_compute_forward_rope_f32(
                     float block_theta = MAX(p - (n_ctx - 2), 0);
                     for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                         const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                         const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
                         theta_base *= theta_scale;
                         block_theta *= theta_scale;
@@ -11187,6 +11376,7 @@ static void wsp_ggml_compute_forward_rope_f32(
                         rope_yarn(
                             theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                         );
+                        sin_theta *= sin_sign;
                         // zeta scaling for xPos only:
                         float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
@@ -11217,6 +11407,7 @@ static void wsp_ggml_compute_forward_rope_f32(
                                 theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
+                            sin_theta *= sin_sign;
                             theta_base *= theta_scale;
@@ -11242,7 +11433,8 @@ static void wsp_ggml_compute_forward_rope_f16(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
-        struct wsp_ggml_tensor * dst) {
+        struct wsp_ggml_tensor * dst,
+        const bool forward) {
     if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
         return;
     }
@@ -11294,6 +11486,11 @@ static void wsp_ggml_compute_forward_rope_f16(
     const bool is_neox = mode & 2;
     const bool is_glm  = mode & 4;
+    // backward process uses inverse rotation by cos and sin.
+    // cos and sin build a rotation matrix, where the inverse is the transpose.
+    // this essentially just switches the sign of sin.
+    const float sin_sign = forward ? 1.0f : -1.0f;
     const int32_t * pos = (const int32_t *) src1->data;
     for (int64_t i3 = 0; i3 < ne3; i3++) {
@@ -11310,9 +11507,9 @@ static void wsp_ggml_compute_forward_rope_f16(
                     float block_theta = MAX(p - (n_ctx - 2), 0);
                     for (int64_t i0 = 0; i0 < ne0 / 4; i0++) {
                         const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+                        const float sin_theta = sinf(theta_base) * sin_sign;
                         const float cos_block_theta = cosf(block_theta);
-                        const float sin_block_theta = sinf(block_theta);
+                        const float sin_block_theta = sinf(block_theta) * sin_sign;
                         theta_base *= theta_scale;
                         block_theta *= theta_scale;
@@ -11336,6 +11533,7 @@ static void wsp_ggml_compute_forward_rope_f16(
                         rope_yarn(
                             theta_base, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta
                         );
+                        sin_theta *= sin_sign;
                         theta_base *= theta_scale;
@@ -11362,6 +11560,7 @@ static void wsp_ggml_compute_forward_rope_f16(
                                 theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
                                 &cos_theta, &sin_theta
                             );
+                            sin_theta *= sin_sign;
                             theta_base *= theta_scale;
@@ -11391,11 +11590,11 @@ static void wsp_ggml_compute_forward_rope(
     switch (src0->type) {
         case WSP_GGML_TYPE_F16:
             {
-                wsp_ggml_compute_forward_rope_f16(params, src0, src1, dst);
+                wsp_ggml_compute_forward_rope_f16(params, src0, src1, dst, true);
             } break;
         case WSP_GGML_TYPE_F32:
             {
-                wsp_ggml_compute_forward_rope_f32(params, src0, src1, dst);
+                wsp_ggml_compute_forward_rope_f32(params, src0, src1, dst, true);
             } break;
         default:
             {
@@ -11406,726 +11605,106 @@ static void wsp_ggml_compute_forward_rope(
 // wsp_ggml_compute_forward_rope_back
-static void wsp_ggml_compute_forward_rope_back_f32(
+static void wsp_ggml_compute_forward_rope_back(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
         struct wsp_ggml_tensor * dst) {
-    if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
+    switch (src0->type) {
+        case WSP_GGML_TYPE_F16:
+            {
+                wsp_ggml_compute_forward_rope_f16(params, src0, src1, dst, false);
+            } break;
+        case WSP_GGML_TYPE_F32:
+            {
+                wsp_ggml_compute_forward_rope_f32(params, src0, src1, dst, false);
+            } break;
+        default:
+            {
+                WSP_GGML_ASSERT(false);
+            } break;
     }
+}
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-    float freq_base;
-    float freq_scale;
-    // these two only relevant for xPos RoPE:
-    float xpos_base;
-    bool xpos_down;
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    const int n_ctx  = ((int32_t *) dst->op_params)[3]; UNUSED(n_ctx);
-    memcpy(&freq_base,  (int32_t *) dst->op_params + 4, sizeof(float));
-    memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float));
-    memcpy(&xpos_base,  (int32_t *) dst->op_params + 6, sizeof(float));
-    memcpy(&xpos_down,  (int32_t *) dst->op_params + 7, sizeof(bool));
+// wsp_ggml_compute_forward_conv_transpose_1d
-    WSP_GGML_TENSOR_UNARY_OP_LOCALS
+static void wsp_ggml_compute_forward_conv_transpose_1d_f16_f32(
+        const struct wsp_ggml_compute_params * params,
+        const struct wsp_ggml_tensor * src0,
+        const struct wsp_ggml_tensor * src1,
+              struct wsp_ggml_tensor * dst) {
+    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
+    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
+    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
+    int64_t t0 = wsp_ggml_perf_time_us();
+    UNUSED(t0);
-    assert(nb0 == sizeof(float));
+    WSP_GGML_TENSOR_BINARY_OP_LOCALS
     const int ith = params->ith;
     const int nth = params->nth;
-    const int nr = wsp_ggml_nrows(dst);
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
+    const int nk = ne00*ne01*ne02;
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
+    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
+    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    // row index used to determine which thread to use
-    int ir = 0;
+    if (params->type == WSP_GGML_TASK_INIT) {
+        memset(params->wdata, 0, params->wsize);
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
+        {
+            wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-    const bool is_neox = mode & 2;
+            for (int64_t i02 = 0; i02 < ne02; i02++) {
+                for (int64_t i01 = 0; i01 < ne01; i01++) {
+                    const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
+                    wsp_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
+                    for (int64_t i00 = 0; i00 < ne00; i00++) {
+                        dst_data[i00*ne02 + i02] = src[i00];
+                    }
+                }
+            }
+        }
-    const int32_t * pos = (const int32_t *) src1->data;
+        // permute source data (src1) from (L x Cin) to (Cin x L)
+        {
+            wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + nk;
+            wsp_ggml_fp16_t * dst_data = wdata;
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
+            for (int64_t i11 = 0; i11 < ne11; i11++) {
+                const float * const src = (float *)((char *) src1->data + i11*nb11);
+                for (int64_t i10 = 0; i10 < ne10; i10++) {
+                    dst_data[i10*ne11 + i11] = WSP_GGML_FP32_TO_FP16(src[i10]);
+                }
+            }
+        }
-                float theta_base = freq_scale * (float)p;
+        // need to zero dst since we are accumulating into it
+        memset(dst->data, 0, wsp_ggml_nbytes(dst));
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
+        return;
+    }
-                        // zeta scaling for xPos only:
-                        float zeta = xpos_base != 0.0f ? powf((i0 + 0.4f * ne0) / (1.4f * ne0), p / xpos_base) : 1.0f;
-                        if (xpos_down) zeta = 1.0f / zeta;
+    if (params->type == WSP_GGML_TASK_FINALIZE) {
+        return;
+    }
-                        theta_base *= theta_scale;
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-                        const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
+    // total rows in dst
+    const int nr = ne1;
-                        const float dy0 = dy[0];
-                        const float dy1 = dy[1];
+    // rows per thread
+    const int dr = (nr + nth - 1)/nth;
-                        dx[0] =   dy0*cos_theta*zeta + dy1*sin_theta*zeta;
-                        dx[1] = - dy0*sin_theta*zeta + dy1*cos_theta*zeta;
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
+    // row range for this thread
+    const int ir0 = dr*ith;
+    const int ir1 = MIN(ir0 + dr, nr);
-                            theta_base *= theta_scale;
-                            const int64_t i0 = ib*n_dims + ic/2;
-                            const float * const dy  = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  float *       dx  = (float *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-                            const float dy0 = dy[0];
-                            const float dy1 = dy[n_dims/2];
-                            dx[0]        =   dy0*cos_theta + dy1*sin_theta;
-                            dx[n_dims/2] = - dy0*sin_theta + dy1*cos_theta;
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-static void wsp_ggml_compute_forward_rope_back_f16(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-        struct wsp_ggml_tensor * dst) {
-    if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    // y = rope(x, src1)
-    // dx = rope_back(dy, src1)
-    // src0 is dy, src1 contains options
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_dims = ((int32_t *) dst->op_params)[1];
-    const int mode   = ((int32_t *) dst->op_params)[2];
-    WSP_GGML_TENSOR_UNARY_OP_LOCALS
-    //printf("ne0: %d, ne1: %d, ne2: %d, ne3: %d\n", ne0, ne1, ne2, ne3);
-    //printf("n_past = %d, ne2 = %d\n", n_past, ne2);
-    assert(nb0 == sizeof(wsp_ggml_fp16_t));
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int nr = wsp_ggml_nrows(dst);
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    // row index used to determine which thread to use
-    int ir = 0;
-    const float theta_scale = powf(10000.0, -2.0f/n_dims);
-    const bool is_neox = mode & 2;
-    const int32_t * pos = (const int32_t *) src1->data;
-    for (int64_t i3 = 0; i3 < ne3; i3++) {
-        for (int64_t i2 = 0; i2 < ne2; i2++) {
-            const int64_t p = pos[i2];
-            for (int64_t i1 = 0; i1 < ne1; i1++) {
-                if (ir++ < ir0) continue;
-                if (ir   > ir1) break;
-                float theta_base = (float)p;
-                if (!is_neox) {
-                    for (int64_t i0 = 0; i0 < ne0; i0 += 2) {
-                        const float cos_theta = cosf(theta_base);
-                        const float sin_theta = sinf(theta_base);
-                        theta_base *= theta_scale;
-                        const wsp_ggml_fp16_t * const dy  = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                              wsp_ggml_fp16_t *       dx  = (wsp_ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-                        const float dy0 = WSP_GGML_FP16_TO_FP32(dy[0]);
-                        const float dy1 = WSP_GGML_FP16_TO_FP32(dy[1]);
-                        dx[0] = WSP_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                        dx[1] = WSP_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                    }
-                } else {
-                    for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
-                        for (int64_t ic = 0; ic < n_dims; ic += 2) {
-                            const float cos_theta = cosf(theta_base);
-                            const float sin_theta = sinf(theta_base);
-                            theta_base *= theta_scale;
-                            const int64_t i0 = ib*n_dims + ic/2;
-                            const wsp_ggml_fp16_t * const dy  = (wsp_ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
-                                  wsp_ggml_fp16_t *       dx  = (wsp_ggml_fp16_t *)((char *)  dst->data + i3*nb3  + i2*nb2  + i1*nb1  + i0*nb0);
-                            const float dy0 = WSP_GGML_FP16_TO_FP32(dy[0]);
-                            const float dy1 = WSP_GGML_FP16_TO_FP32(dy[n_dims/2]);
-                            dx[0]        = WSP_GGML_FP32_TO_FP16( dy0*cos_theta + dy1*sin_theta);
-                            dx[n_dims/2] = WSP_GGML_FP32_TO_FP16(-dy0*sin_theta + dy1*cos_theta);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-static void wsp_ggml_compute_forward_rope_back(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-        struct wsp_ggml_tensor * dst) {
-    switch (src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_rope_back_f16(params, src0, src1, dst);
-            } break;
-        case WSP_GGML_TYPE_F32:
-            {
-                wsp_ggml_compute_forward_rope_back_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-    }
-}
-// wsp_ggml_compute_forward_conv_1d
-static void wsp_ggml_compute_forward_conv_1d_f16_f32(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int nk = ne00;
-    // size of the convolution row - the kernel size unrolled across all input channels
-    const int ew0 = nk*ne01;
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    if (params->type == WSP_GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-        wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            wsp_ggml_fp16_t * dst_data = wdata;
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = WSP_GGML_FP32_TO_FP16(src[idx0]);
-                    }
-                }
-            }
-        }
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    // total rows in dst
-    const int nr = ne2;
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                wsp_ggml_vec_dot_f16(ew0, dst_data + i0,
-                        (wsp_ggml_fp16_t *) ((char *) src0->data + i1*nb02),
-                        (wsp_ggml_fp16_t *)                wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-static void wsp_ggml_compute_forward_conv_1d_f32(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int nk = ne00;
-    const int ew0 = nk*ne01;
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-    WSP_GGML_ASSERT(nb00 == sizeof(float));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    if (params->type == WSP_GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-        float * const wdata = (float *) params->wdata + 0;
-        for (int64_t i11 = 0; i11 < ne11; i11++) {
-            const float * const src = (float *)((char *) src1->data + i11*nb11);
-            float * dst_data = wdata;
-            for (int64_t i0 = 0; i0 < ne0; i0++) {
-                for (int64_t ik = 0; ik < nk; ik++) {
-                    const int idx0 = i0*s0 + ik*d0 - p0;
-                    if(!(idx0 < 0 || idx0 >= ne10)) {
-                        dst_data[i0*ew0 + i11*nk + ik] = src[idx0];
-                    }
-                }
-            }
-        }
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    // total rows in dst
-    const int nr = ne02;
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    float * const wdata = (float *) params->wdata + 0;
-    for (int i2 = 0; i2 < ne2; i2++) {
-        for (int i1 = ir0; i1 < ir1; i1++) {
-            float * dst_data = (float *)((char *) dst->data + i2*nb2 + i1*nb1);
-            for (int i0 = 0; i0 < ne0; i0++) {
-                wsp_ggml_vec_dot_f32(ew0, dst_data + i0,
-                        (float *) ((char *) src0->data + i1*nb02),
-                        (float *)                wdata + i2*nb2 + i0*ew0);
-            }
-        }
-    }
-}
-// TODO: reuse wsp_ggml_mul_mat or implement wsp_ggml_im2col and remove stage_0 and stage_1
-static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
-                             wsp_ggml_fp16_t * A,
-                             wsp_ggml_fp16_t * B,
-                             float * C,
-                             const int ith, const int nth) {
-    // does not seem to make a difference
-    int64_t m0, m1, n0, n1;
-    // patches per thread
-    if (m > n) {
-        n0 = 0;
-        n1 = n;
-        // total patches in dst
-        const int np = m;
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-        // patch range for this thread
-        m0 = dp*ith;
-        m1 = MIN(m0 + dp, np);
-    } else {
-        m0 = 0;
-        m1 = m;
-        // total patches in dst
-        const int np = n;
-        // patches per thread
-        const int dp = (np + nth - 1)/nth;
-        // patch range for this thread
-        n0 = dp*ith;
-        n1 = MIN(n0 + dp, np);
-    }
-    // block-tiling attempt
-    int64_t blck_n = 16;
-    int64_t blck_m = 16;
-    // int64_t CACHE_SIZE = 2 * 1024 * 1024; // 2MB
-    // int64_t blck_size = CACHE_SIZE / (sizeof(float) + 2 * sizeof(wsp_ggml_fp16_t) * K);
-    // if (blck_size > 0) {
-    //     blck_0 = 4;
-    //     blck_1 = blck_size / blck_0;
-    //     if (blck_1 < 0) {
-    //         blck_1 = 1;
-    //     }
-    //     // blck_0 = (int64_t)sqrt(blck_size);
-    //     // blck_1 = blck_0;
-    // }
-    // // printf("%zd %zd %zd %zd\n", blck_size, K, blck_0, blck_1);
-    for (int j = n0; j < n1; j+=blck_n) {
-        for (int i = m0; i < m1; i+=blck_m) {
-            // printf("i j k => %d %d %d\n", i, j, K);
-            for (int ii = i; ii < i + blck_m && ii < m1; ii++) {
-                for (int jj = j; jj < j + blck_n && jj < n1; jj++) {
-                    wsp_ggml_vec_dot_f16(k,
-                                    C + ii*n + jj,
-                                    A + ii * k,
-                                    B + jj * k);
-                }
-            }
-        }
-    }
-}
-// src0: kernel [OC, IC, K]
-// src1: signal [N, IC, IL]
-// dst:  result [N, OL, IC*K]
-static void wsp_ggml_compute_forward_conv_1d_stage_0_f32(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F16);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS;
-    const int64_t N  = ne12;
-    const int64_t IC = ne11;
-    const int64_t IL = ne10;
-    const int64_t K = ne00;
-    const int64_t OL = ne1;
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[2];
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    if (params->type == WSP_GGML_TASK_INIT) {
-        memset(dst->data, 0, wsp_ggml_nbytes(dst));
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    // im2col: [N, IC, IL] => [N, OL, IC*K]
-    {
-        wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) dst->data;
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t iol = 0; iol < OL; iol++) {
-                for (int64_t iic = ith; iic < IC; iic+=nth) {
-                    // micro kernel
-                    wsp_ggml_fp16_t * dst_data = wdata + (in*OL + iol)*(IC*K); // [IC, K]
-                    const float * const src_data = (float *)((char *) src1->data + in*nb12 + iic*nb11); // [IL]
-                    for (int64_t ik = 0; ik < K; ik++) {
-                        const int64_t iil = iol*s0 + ik*d0 - p0;
-                        if (!(iil < 0 || iil >= IL)) {
-                            dst_data[iic*K + ik] = WSP_GGML_FP32_TO_FP16(src_data[iil]);
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-// gemm: [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-// src0: [OC, IC, K]
-// src1: [N, OL, IC * K]
-// result: [N, OC, OL]
-static void wsp_ggml_compute_forward_conv_1d_stage_1_f16(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    if (params->type == WSP_GGML_TASK_INIT) {
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS;
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb0  == sizeof(float));
-    const int N = ne12;
-    const int OL = ne11;
-    const int OC = ne02;
-    const int IC = ne01;
-    const int K  = ne00;
-    const int ith = params->ith;
-    const int nth = params->nth;
-    int64_t m = OC;
-    int64_t n = OL;
-    int64_t k = IC * K;
-    // [N, OC, OL] = [OC, IC * K] x [N*OL, IC * K]
-    for (int i = 0; i < N; i++) {
-        wsp_ggml_fp16_t * A = (wsp_ggml_fp16_t *)src0->data; // [m, k]
-        wsp_ggml_fp16_t * B = (wsp_ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m, n]
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-static void wsp_ggml_compute_forward_conv_1d(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    switch(src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_conv_1d_f16_f32(params, src0, src1, dst);
-            } break;
-        case WSP_GGML_TYPE_F32:
-            {
-                wsp_ggml_compute_forward_conv_1d_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-    }
-}
-static void wsp_ggml_compute_forward_conv_1d_stage_0(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    switch(src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_conv_1d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-    }
-}
-static void wsp_ggml_compute_forward_conv_1d_stage_1(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    switch(src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_conv_1d_stage_1_f16(params, src0, src1, dst);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-    }
-}
-// wsp_ggml_compute_forward_conv_transpose_1d
-static void wsp_ggml_compute_forward_conv_transpose_1d_f16_f32(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int nk = ne00*ne01*ne02;
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    if (params->type == WSP_GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-        // permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
-        {
-            wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-            for (int64_t i02 = 0; i02 < ne02; i02++) {
-                for (int64_t i01 = 0; i01 < ne01; i01++) {
-                    const wsp_ggml_fp16_t * const src = (wsp_ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);
-                    wsp_ggml_fp16_t * dst_data = wdata + i01*ne00*ne02;
-                    for (int64_t i00 = 0; i00 < ne00; i00++) {
-                        dst_data[i00*ne02 + i02] = src[i00];
-                    }
-                }
-            }
-        }
-        // permute source data (src1) from (L x Cin) to (Cin x L)
-        {
-            wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + nk;
-            wsp_ggml_fp16_t * dst_data = wdata;
-            for (int64_t i11 = 0; i11 < ne11; i11++) {
-                const float * const src = (float *)((char *) src1->data + i11*nb11);
-                for (int64_t i10 = 0; i10 < ne10; i10++) {
-                    dst_data[i10*ne11 + i11] = WSP_GGML_FP32_TO_FP16(src[i10]);
-                }
-            }
-        }
-        // need to zero dst since we are accumulating into it
-        memset(dst->data, 0, wsp_ggml_nbytes(dst));
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    // total rows in dst
-    const int nr = ne1;
-    // rows per thread
-    const int dr = (nr + nth - 1)/nth;
-    // row range for this thread
-    const int ir0 = dr*ith;
-    const int ir1 = MIN(ir0 + dr, nr);
-    wsp_ggml_fp16_t * const wdata     = (wsp_ggml_fp16_t *) params->wdata + 0;
-    wsp_ggml_fp16_t * const wdata_src = wdata + nk;
+    wsp_ggml_fp16_t * const wdata     = (wsp_ggml_fp16_t *) params->wdata + 0;
+    wsp_ggml_fp16_t * const wdata_src = wdata + nk;
     for (int i1 = ir0; i1 < ir1; i1++) {
         float * dst_data = (float *)((char *) dst->data + i1*nb1);
@@ -12258,12 +11837,10 @@ static void wsp_ggml_compute_forward_conv_transpose_1d(
     }
 }
-// wsp_ggml_compute_forward_conv_2d
 // src0: kernel [OC, IC, KH, KW]
 // src1: image [N, IC, IH, IW]
 // dst:  result [N, OH, OW, IC*KH*KW]
-static void wsp_ggml_compute_forward_conv_2d_stage_0_f32(
+static void wsp_ggml_compute_forward_im2col_f16(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
@@ -12277,218 +11854,35 @@ static void wsp_ggml_compute_forward_conv_2d_stage_0_f32(
     WSP_GGML_TENSOR_BINARY_OP_LOCALS;
-    const int64_t N = ne13;
-    const int64_t IC = ne12;
-    const int64_t IH = ne11;
-    const int64_t IW = ne10;
-    // const int64_t OC = ne03;
-    // const int64_t IC = ne02;
-    const int64_t KH = ne01;
-    const int64_t KW = ne00;
-    const int64_t OH = ne2;
-    const int64_t OW = ne1;
-    const int ith = params->ith;
-    const int nth = params->nth;
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(float));
-    if (params->type == WSP_GGML_TASK_INIT) {
-        memset(dst->data, 0, wsp_ggml_nbytes(dst));
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
-    {
-        wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) dst->data;
-        for (int64_t in = 0; in < N; in++) {
-            for (int64_t ioh = 0; ioh < OH; ioh++) {
-                for (int64_t iow = 0; iow < OW; iow++) {
-                    for (int64_t iic = ith; iic < IC; iic+=nth) {
-                        // micro kernel
-                        wsp_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                        const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
-                        for (int64_t ikh = 0; ikh < KH; ikh++) {
-                            for (int64_t ikw = 0; ikw < KW; ikw++) {
-                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
-                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
-                                if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
-                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = WSP_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-// src0: [OC, IC, KH, KW]
-// src1: [N, OH, OW, IC * KH * KW]
-// result: [N, OC, OH, OW]
-static void wsp_ggml_compute_forward_conv_2d_stage_1_f16(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    if (params->type == WSP_GGML_TASK_INIT) {
-        return;
-    }
-    if (params->type == WSP_GGML_TASK_FINALIZE) {
-        return;
-    }
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS;
-    WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb10 == sizeof(wsp_ggml_fp16_t));
-    WSP_GGML_ASSERT(nb0  == sizeof(float));
-    const int N = ne13;
-    const int OH = ne12;
-    const int OW = ne11;
-    const int OC = ne03;
-    const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
+    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
     const int ith = params->ith;
     const int nth = params->nth;
-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        wsp_ggml_fp16_t * A = (wsp_ggml_fp16_t *)src0->data; // [m, k]
-        wsp_ggml_fp16_t * B = (wsp_ggml_fp16_t *)src1->data + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m, n]
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-static void wsp_ggml_compute_forward_conv_2d_f16_f32(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    WSP_GGML_ASSERT(src0->type == WSP_GGML_TYPE_F16);
-    WSP_GGML_ASSERT(src1->type == WSP_GGML_TYPE_F32);
-    WSP_GGML_ASSERT( dst->type == WSP_GGML_TYPE_F32);
-    int64_t t0 = wsp_ggml_perf_time_us();
-    UNUSED(t0);
-    WSP_GGML_TENSOR_BINARY_OP_LOCALS
-    // src1: image [N, IC, IH, IW]
-    // src0: kernel [OC, IC, KH, KW]
-    // dst:  result [N, OC, OH, OW]
-    // ne12: IC
-    // ne0: OW
-    // ne1: OH
-    // nk0: KW
-    // nk1: KH
-    // ne13: N
-    const int N = ne13;
-    const int IC = ne12;
-    const int IH = ne11;
-    const int IW = ne10;
-    const int OC = ne03;
-    // const int IC = ne02;
-    const int KH = ne01;
-    const int KW = ne00;
-    const int OH = ne1;
-    const int OW = ne0;
-    const int ith = params->ith;
-    const int nth = params->nth;
+    const int64_t N  = is_2D ? ne13 : ne12;
+    const int64_t IC = is_2D ? ne12 : ne11;
+    const int64_t IH = is_2D ? ne11 : 1;
+    const int64_t IW = ne10;
-    // const int nk0 = ne00;
-    // const int nk1 = ne01;
+    const int64_t KH = is_2D ? ne01 : 1;
+    const int64_t KW = ne00;
-    // size of the convolution row - the kernel size unrolled across all channels
-    // const int ew0 = nk0*nk1*ne02;
-    // ew0: IC*KH*KW
+    const int64_t OH = is_2D ? ne2 : 1;
+    const int64_t OW = ne1;
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+    int ofs0 = is_2D ? nb13 : nb12;
+    int ofs1 = is_2D ? nb12 : nb11;
     WSP_GGML_ASSERT(nb00 == sizeof(wsp_ggml_fp16_t));
     WSP_GGML_ASSERT(nb10 == sizeof(float));
     if (params->type == WSP_GGML_TASK_INIT) {
-        memset(params->wdata, 0, params->wsize);
-        // prepare source data (src1)
-        // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
-        {
-            wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-            for (int in = 0; in < N; in++) {
-                for (int iic = 0; iic < IC; iic++) {
-                    for (int ioh = 0; ioh < OH; ioh++) {
-                        for (int iow = 0; iow < OW; iow++) {
-                            // micro kernel
-                            wsp_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
-                            const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
-                            for (int ikh = 0; ikh < KH; ikh++) {
-                                for (int ikw = 0; ikw < KW; ikw++) {
-                                    const int iiw = iow*s0 + ikw*d0 - p0;
-                                    const int iih = ioh*s1 + ikh*d1 - p1;
-                                    if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
-                                        dst_data[iic*(KH*KW) + ikh*KW + ikw] = WSP_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
         return;
     }
@@ -12496,69 +11890,39 @@ static void wsp_ggml_compute_forward_conv_2d_f16_f32(
         return;
     }
-    wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) params->wdata + 0;
-    // wdata: [N*OH*OW, IC*KH*KW]
-    // dst: result [N, OC, OH, OW]
-    // src0: kernel [OC, IC, KH, KW]
-    int64_t m = OC;
-    int64_t n = OH * OW;
-    int64_t k = IC * KH * KW;
-    // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
-    for (int i = 0; i < N; i++) {
-        wsp_ggml_fp16_t * A = (wsp_ggml_fp16_t *)src0->data; // [m, k]
-        wsp_ggml_fp16_t * B = (wsp_ggml_fp16_t *)wdata + i * m * k; // [n, k]
-        float * C = (float *)dst->data + i * m * n; // [m * k]
-        gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
-    }
-}
-static void wsp_ggml_compute_forward_conv_2d(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    switch (src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst);
-            } break;
-        case WSP_GGML_TYPE_F32:
-            {
-                //wsp_ggml_compute_forward_conv_2d_f32(params, src0, src1, dst);
-                WSP_GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-    }
-}
-static void wsp_ggml_compute_forward_conv_2d_stage_0(
-        const struct wsp_ggml_compute_params * params,
-        const struct wsp_ggml_tensor * src0,
-        const struct wsp_ggml_tensor * src1,
-              struct wsp_ggml_tensor * dst) {
-    switch (src0->type) {
-        case WSP_GGML_TYPE_F16:
-            {
-                wsp_ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
-            } break;
-        case WSP_GGML_TYPE_F32:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
-        default:
-            {
-                WSP_GGML_ASSERT(false);
-            } break;
+    // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
+    {
+        wsp_ggml_fp16_t * const wdata = (wsp_ggml_fp16_t *) dst->data;
+        for (int64_t in = 0; in < N; in++) {
+            for (int64_t ioh = 0; ioh < OH; ioh++) { // 1
+                for (int64_t iow = 0; iow < OW; iow++) {
+                    for (int64_t iic = ith; iic < IC; iic += nth) {
+                        // micro kernel
+                        wsp_ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
+                        const float * const src_data = (float *)((char *) src1->data + in*ofs0 + iic*ofs1); // [IH, IW]
+                        for (int64_t ikh = 0; ikh < KH; ikh++) {  // 1
+                            for (int64_t ikw = 0; ikw < KW; ikw++) {
+                                const int64_t iiw = iow*s0 + ikw*d0 - p0;
+                                const int64_t iih = ioh*s1 + ikh*d1 - p1;
+                                if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = 0;
+                                } else {
+                                    dst_data[iic*(KH*KW) + ikh*KW + ikw] = WSP_GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
     }
 }
-static void wsp_ggml_compute_forward_conv_2d_stage_1(
+static void wsp_ggml_compute_forward_im2col(
         const struct wsp_ggml_compute_params * params,
         const struct wsp_ggml_tensor * src0,
         const struct wsp_ggml_tensor * src1,
@@ -12566,7 +11930,7 @@ static void wsp_ggml_compute_forward_conv_2d_stage_1(
     switch (src0->type) {
         case WSP_GGML_TYPE_F16:
             {
-                wsp_ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
+                wsp_ggml_compute_forward_im2col_f16(params, src0, src1, dst);
             } break;
         case WSP_GGML_TYPE_F32:
             {
@@ -12839,6 +12203,7 @@ static void wsp_ggml_compute_forward_upscale_f32(
     WSP_GGML_ASSERT(src0->nb[0] == sizeof(float));
     const int ith = params->ith;
+    const int nth = params->nth;
     WSP_GGML_TENSOR_UNARY_OP_LOCALS
@@ -12846,16 +12211,17 @@ static void wsp_ggml_compute_forward_upscale_f32(
     // TODO: optimize
-    for (int i03 = 0; i03 < ne03; i03++) {
-        for (int i02 = ith; i02 < ne02; i02++) {
-            for (int m = 0; m < dst->ne[1]; m++) {
-                int i01 = m / scale_factor;
-                for (int n = 0; n < dst->ne[0]; n++) {
-                    int i00 = n / scale_factor;
-                    const float * x = (float *)((char *) src0->data + i00 * nb00 +i01 * nb01 + i02 * nb02 + i03 * nb03);
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        const int64_t i03 = i3;
+        for (int64_t i2 = ith; i2 < ne2; i2 += nth) {
+            const int64_t i02 = i2;
+            for (int64_t i1 = 0; i1 < ne1; i1++) {
+                const int64_t i01 = i1 / scale_factor;
+                for (int64_t i0 = 0; i0 < ne0; i0++) {
+                    const int64_t i00 = i0 / scale_factor;
-                    float * y = (float *)((char *) dst->data + n * dst->nb[0] + m * dst->nb[1] + i02 * dst->nb[2] + i03 * dst->nb[3]);
+                    const float * x = (float *)((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                          float * y = (float *)((char *)  dst->data +  i0*nb0  +  i1*nb1  +  i2*nb2  +  i3*nb3);
                     *y = *x;
                 }
@@ -12880,6 +12246,125 @@ static void wsp_ggml_compute_forward_upscale(
     }
 }
+// wsp_ggml_compute_forward_pad
+// Zero-padding kernel for F32 tensors.
+//
+// Every element of dst whose (i0, i1, i2, i3) coordinate lies inside src0's
+// extents is copied from src0; every other element of dst is set to 0.
+//
+//   params - task descriptor; the INIT and FINALIZE passes do nothing.
+//            Worker ith of nth handles rows i1 = ith, ith + nth, ...
+//   src0   - source tensor, innermost stride must be sizeof(float)
+//   dst    - destination tensor, innermost stride must be sizeof(float);
+//            it is written through a flat index built from ne0, ne1 and ne2,
+//            not through its byte strides
+static void wsp_ggml_compute_forward_pad_f32(
+    const struct wsp_ggml_compute_params * params,
+    const struct wsp_ggml_tensor * src0,
+          struct wsp_ggml_tensor * dst) {
+    if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
+        return;
+    }
+    WSP_GGML_ASSERT(src0->nb[0] == sizeof(float));
+    WSP_GGML_ASSERT( dst->nb[0] == sizeof(float));
+    const int ith = params->ith;
+    const int nth = params->nth;
+    // supplies the ne0x/nb0x (src0) and nex/nbx (dst) locals used below
+    WSP_GGML_TENSOR_UNARY_OP_LOCALS
+    float * dst_ptr = (float *) dst->data;
+    // TODO: optimize
+    for (int64_t i2 = 0; i2 < ne2; ++i2) {
+        for (int64_t i1 = ith; i1 < ne1; i1 += nth) {
+            for (int64_t i0 = 0; i0 < ne0; ++i0) {
+                for (int64_t i3 = 0; i3 < ne3; ++i3) {
+                    const int64_t dst_idx = i3*(ne0*ne1*ne2) + i2*(ne0*ne1) + i1*ne0 + i0;
+                    if (i0 < ne00 && i1 < ne01 && i2 < ne02 && i3 < ne03) {
+                        // Form the source address only for in-range coordinates:
+                        // computing a pointer beyond the source buffer is undefined
+                        // behaviour even when it is never dereferenced.
+                        const float * src_ptr = (const float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
+                        dst_ptr[dst_idx] = *src_ptr;
+                    } else {
+                        // padding region
+                        dst_ptr[dst_idx] = 0;
+                    }
+                }
+            }
+        }
+    }
+}
+// Type dispatcher for the pad operator: F32 sources go to the F32 kernel,
+// any other source type reaches WSP_GGML_ASSERT(false).
+static void wsp_ggml_compute_forward_pad(
+    const struct wsp_ggml_compute_params * params,
+    const struct wsp_ggml_tensor * src0,
+    struct wsp_ggml_tensor * dst) {
+    if (src0->type == WSP_GGML_TYPE_F32) {
+        wsp_ggml_compute_forward_pad_f32(params, src0, dst);
+        return;
+    }
+    // no kernel exists for other source types
+    WSP_GGML_ASSERT(false);
+}
+// wsp_ggml_compute_forward_argsort
+// Row-wise argsort for F32 input.
+//
+// For each row of src0, the matching row of dst receives the int32 element
+// indices 0..ne0-1 reordered by the row's values. The direction
+// (WSP_GGML_SORT_ASC or WSP_GGML_SORT_DESC) is read from dst's op param 0.
+// The INIT and FINALIZE passes do nothing; worker ith of nth handles rows
+// ith, ith + nth, ...
+static void wsp_ggml_compute_forward_argsort_f32(
+    const struct wsp_ggml_compute_params * params,
+    const struct wsp_ggml_tensor * src0,
+    struct wsp_ggml_tensor * dst) {
+    if (params->type == WSP_GGML_TASK_INIT || params->type == WSP_GGML_TASK_FINALIZE) {
+        return;
+    }
+    WSP_GGML_TENSOR_UNARY_OP_LOCALS
+    // NOTE(review): this checks dst's innermost stride against sizeof(float)
+    // although dst rows are written as int32_t - confirm this is intended
+    WSP_GGML_ASSERT(nb0 == sizeof(float));
+    const int worker    = params->ith;
+    const int n_workers = params->nth;
+    const int64_t n_rows = wsp_ggml_nrows(src0);
+    const enum wsp_ggml_sort_order order = (enum wsp_ggml_sort_order) wsp_ggml_get_op_params_i32(dst, 0);
+    for (int64_t row = worker; row < n_rows; row += n_workers) {
+        const float * values  = (float *)((char *) src0->data + row*nb01);
+        int32_t     * indices = (int32_t *)((char *) dst->data + row*nb1);
+        // begin with the identity permutation
+        for (int64_t c = 0; c < ne0; c++) {
+            indices[c] = c;
+        }
+        // quadratic exchange sort: slot a is compared with every later slot b
+        // and the two indices are swapped whenever they are out of order
+        for (int64_t a = 0; a < ne0; a++) {
+            for (int64_t b = a + 1; b < ne0; b++) {
+                // re-read on every pass: indices[a] may have been swapped
+                const float va = values[indices[a]];
+                const float vb = values[indices[b]];
+                bool misplaced;
+                switch (order) {
+                    case WSP_GGML_SORT_ASC:  misplaced = va > vb; break;
+                    case WSP_GGML_SORT_DESC: misplaced = va < vb; break;
+                    default:                 misplaced = false;   break;
+                }
+                if (misplaced) {
+                    const int32_t held = indices[a];
+                    indices[a] = indices[b];
+                    indices[b] = held;
+                }
+            }
+        }
+    }
+}
+// Type dispatcher for the argsort operator: F32 sources go to the F32 kernel,
+// any other source type reaches WSP_GGML_ASSERT(false).
+static void wsp_ggml_compute_forward_argsort(
+    const struct wsp_ggml_compute_params * params,
+    const struct wsp_ggml_tensor * src0,
+    struct wsp_ggml_tensor * dst) {
+    if (src0->type == WSP_GGML_TYPE_F32) {
+        wsp_ggml_compute_forward_argsort_f32(params, src0, dst);
+        return;
+    }
+    // no kernel exists for other source types
+    WSP_GGML_ASSERT(false);
+}
 // wsp_ggml_compute_forward_flash_attn
 static void wsp_ggml_compute_forward_flash_attn_f32(
@@ -14026,10 +13511,6 @@ static void wsp_ggml_compute_forward_unary(
             {
                 wsp_ggml_compute_forward_silu(params, src0, dst);
             } break;
-        case WSP_GGML_UNARY_OP_LEAKY:
-            {
-                wsp_ggml_compute_forward_leaky(params, src0, dst);
-            } break;
         default:
             {
                 WSP_GGML_ASSERT(false);
@@ -14701,7 +14182,11 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             } break;
         case WSP_GGML_OP_MUL_MAT:
             {
-                wsp_ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor);
+                wsp_ggml_compute_forward_mul_mat(params, tensor->src[0], tensor->src[1], tensor, 0, tensor->ne[1]);
+            } break;
+        case WSP_GGML_OP_MUL_MAT_ID:
+            {
+                wsp_ggml_compute_forward_mul_mat_id(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case WSP_GGML_OP_OUT_PROD:
             {
@@ -14761,7 +14246,7 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             } break;
         case WSP_GGML_OP_SOFT_MAX:
             {
-                wsp_ggml_compute_forward_soft_max(params, tensor->src[0], tensor);
+                wsp_ggml_compute_forward_soft_max(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case WSP_GGML_OP_SOFT_MAX_BACK:
             {
@@ -14783,33 +14268,13 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_clamp(params, tensor->src[0], tensor);
             } break;
-        case WSP_GGML_OP_CONV_1D:
-            {
-                wsp_ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_0:
-            {
-                wsp_ggml_compute_forward_conv_1d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_1:
-            {
-                wsp_ggml_compute_forward_conv_1d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
             {
                 wsp_ggml_compute_forward_conv_transpose_1d(params, tensor->src[0], tensor->src[1], tensor);
             } break;
-        case WSP_GGML_OP_CONV_2D:
-            {
-                wsp_ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_0:
-            {
-                wsp_ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
-            } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_1:
+        case WSP_GGML_OP_IM2COL:
             {
-                wsp_ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
+                wsp_ggml_compute_forward_im2col(params, tensor->src[0], tensor->src[1], tensor);
             } break;
         case WSP_GGML_OP_CONV_TRANSPOSE_2D:
             {
@@ -14827,6 +14292,18 @@ static void wsp_ggml_compute_forward(struct wsp_ggml_compute_params * params, st
             {
                 wsp_ggml_compute_forward_upscale(params, tensor->src[0], tensor);
             } break;
+        case WSP_GGML_OP_PAD:
+            {
+                wsp_ggml_compute_forward_pad(params, tensor->src[0], tensor);
+            } break;
+        case WSP_GGML_OP_ARGSORT:
+            {
+                wsp_ggml_compute_forward_argsort(params, tensor->src[0], tensor);
+            } break;
+        case WSP_GGML_OP_LEAKY_RELU:
+            {
+                wsp_ggml_compute_forward_leaky_relu(params, tensor->src[0], tensor);
+            } break;
         case WSP_GGML_OP_FLASH_ATTN:
             {
                 const int32_t t = wsp_ggml_get_op_params_i32(tensor, 0);
@@ -15151,7 +14628,7 @@ void wsp_ggml_build_backward_gradient_checkpointing(
             // insert new tensors recomputing src, reusing already made replacements,
             // remember replacements: remember new tensors with mapping from corresponding gf nodes
             // recurse for input tensors,
-            // unless (i.e. terminating when) input tensors are replacments (like checkpoints)
+            // unless (i.e. terminating when) input tensors are replacements (like checkpoints)
             node->src[k] = wsp_ggml_recompute_graph_node(ctx, gf, replacements, node->src[k]);
         }
         // insert rewritten backward node with replacements made into resulting backward graph gb
@@ -15477,6 +14954,10 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
                                 zero_table);
                 }
             } break;
+        case WSP_GGML_OP_MUL_MAT_ID:
+            {
+                WSP_GGML_ASSERT(false); // TODO: not implemented
+            } break;
         case WSP_GGML_OP_OUT_PROD:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
@@ -15708,17 +15189,20 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
                 // necessary for llama
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
                     src0->grad = wsp_ggml_add_or_set(ctx,
                             src0->grad,
@@ -15728,8 +15212,13 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
                                 n_dims,
                                 mode,
                                 n_ctx,
+                                n_orig_ctx,
                                 freq_base,
                                 freq_scale,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                 xpos_base,
                                 xpos_down),
                             zero_table);
@@ -15739,17 +15228,20 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
             {
                 if (src0->grad) {
                     //const int n_past = ((int32_t *) tensor->op_params)[0];
-                    const int n_dims = ((int32_t *) tensor->op_params)[1];
-                    const int mode   = ((int32_t *) tensor->op_params)[2];
-                    const int n_ctx  = ((int32_t *) tensor->op_params)[3];
-                    float freq_base;
-                    float freq_scale;
-                    float xpos_base;
-                    bool  xpos_down;
-                    memcpy(&freq_base,  (int32_t *) tensor->op_params + 4, sizeof(float));
-                    memcpy(&freq_scale, (int32_t *) tensor->op_params + 5, sizeof(float));
-                    memcpy(&xpos_base,  (int32_t *) tensor->op_params + 6, sizeof(float));
-                    memcpy(&xpos_down,  (int32_t *) tensor->op_params + 7, sizeof(bool));
+                    const int n_dims     = ((int32_t *) tensor->op_params)[1];
+                    const int mode       = ((int32_t *) tensor->op_params)[2];
+                    const int n_ctx      = ((int32_t *) tensor->op_params)[3];
+                    const int n_orig_ctx = ((int32_t *) tensor->op_params)[4];
+                    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow, xpos_base, xpos_down;
+                    memcpy(&freq_base,   (int32_t *) tensor->op_params +  5, sizeof(float));
+                    memcpy(&freq_scale,  (int32_t *) tensor->op_params +  6, sizeof(float));
+                    memcpy(&ext_factor,  (int32_t *) tensor->op_params +  7, sizeof(float));
+                    memcpy(&attn_factor, (int32_t *) tensor->op_params +  8, sizeof(float));
+                    memcpy(&beta_fast,   (int32_t *) tensor->op_params +  9, sizeof(float));
+                    memcpy(&beta_slow,   (int32_t *) tensor->op_params + 10, sizeof(float));
+                    memcpy(&xpos_base,   (int32_t *) tensor->op_params + 11, sizeof(float));
+                    memcpy(&xpos_down,   (int32_t *) tensor->op_params + 12, sizeof(bool));
                     src0->grad = wsp_ggml_add_or_set(ctx,
                             src0->grad,
@@ -15758,14 +15250,14 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
                                 src1,
                                 n_dims,
                                 mode,
-                                0,
                                 n_ctx,
+                                n_orig_ctx,
                                 freq_base,
                                 freq_scale,
-                                0.0f,
-                                1.0f,
-                                0.0f,
-                                0.0f,
+                                ext_factor,
+                                attn_factor,
+                                beta_fast,
+                                beta_slow,
                                 xpos_base,
                                 xpos_down,
                                 false),
@@ -15780,47 +15272,39 @@ static void wsp_ggml_compute_backward(struct wsp_ggml_context * ctx, struct wsp_
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_1D:
-            {
-                WSP_GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_0:
-            {
-                WSP_GGML_ASSERT(false); // TODO: not implemented
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_1:
+        case WSP_GGML_OP_CONV_TRANSPOSE_1D:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_TRANSPOSE_1D:
+        case WSP_GGML_OP_IM2COL:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_2D:
+        case WSP_GGML_OP_CONV_TRANSPOSE_2D:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_0:
+        case WSP_GGML_OP_POOL_1D:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_1:
+        case WSP_GGML_OP_POOL_2D:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_CONV_TRANSPOSE_2D:
+        case WSP_GGML_OP_UPSCALE:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_POOL_1D:
+        case WSP_GGML_OP_PAD:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_POOL_2D:
+        case WSP_GGML_OP_ARGSORT:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
-        case WSP_GGML_OP_UPSCALE:
+        case WSP_GGML_OP_LEAKY_RELU:
             {
                 WSP_GGML_ASSERT(false); // TODO: not implemented
             } break;
@@ -16184,12 +15668,8 @@ struct wsp_ggml_cgraph * wsp_ggml_new_graph(struct wsp_ggml_context * ctx) {
     return wsp_ggml_new_graph_custom(ctx, WSP_GGML_DEFAULT_GRAPH_SIZE, false);
 }
-struct wsp_ggml_cgraph * wsp_ggml_graph_view(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph0, int i0, int i1) {
-    const size_t obj_size = sizeof(struct wsp_ggml_cgraph);
-    struct wsp_ggml_object * obj = wsp_ggml_new_object(ctx, WSP_GGML_OBJECT_GRAPH, obj_size);
-    struct wsp_ggml_cgraph * cgraph = (struct wsp_ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
-    *cgraph = (struct wsp_ggml_cgraph) {
+struct wsp_ggml_cgraph wsp_ggml_graph_view(struct wsp_ggml_cgraph * cgraph0, int i0, int i1) {
+    struct wsp_ggml_cgraph cgraph = {
         /*.size         =*/ 0,
         /*.n_nodes      =*/ i1 - i0,
         /*.n_leafs      =*/ 0,
@@ -16424,7 +15904,6 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                 n_tasks = n_threads;
             } break;
         case WSP_GGML_OP_SUB:
-        case WSP_GGML_OP_DIV:
         case WSP_GGML_OP_SQR:
         case WSP_GGML_OP_SQRT:
         case WSP_GGML_OP_LOG:
@@ -16434,6 +15913,7 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
         case WSP_GGML_OP_ARGMAX:
         case WSP_GGML_OP_REPEAT:
         case WSP_GGML_OP_REPEAT_BACK:
+        case WSP_GGML_OP_LEAKY_RELU:
             {
                 n_tasks = 1;
             } break;
@@ -16446,7 +15926,6 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                 case WSP_GGML_UNARY_OP_TANH:
                 case WSP_GGML_UNARY_OP_ELU:
                 case WSP_GGML_UNARY_OP_RELU:
-                case WSP_GGML_UNARY_OP_LEAKY:
                     {
                         n_tasks = 1;
                     } break;
@@ -16457,10 +15936,13 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                     {
                         n_tasks = n_threads;
                     } break;
+                default:
+                    WSP_GGML_ASSERT(false);
             }
             break;
         case WSP_GGML_OP_SILU_BACK:
         case WSP_GGML_OP_MUL:
+        case WSP_GGML_OP_DIV:
         case WSP_GGML_OP_NORM:
         case WSP_GGML_OP_RMS_NORM:
         case WSP_GGML_OP_RMS_NORM_BACK:
@@ -16498,6 +15980,11 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
                 }
 #endif
             } break;
+        case WSP_GGML_OP_MUL_MAT_ID:
+            {
+                // FIXME: blas
+                n_tasks = n_threads;
+            } break;
         case WSP_GGML_OP_OUT_PROD:
             {
                 n_tasks = n_threads;
@@ -16517,7 +16004,6 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         case WSP_GGML_OP_DIAG_MASK_ZERO:
         case WSP_GGML_OP_DIAG_MASK_INF:
-        case WSP_GGML_OP_SOFT_MAX:
         case WSP_GGML_OP_SOFT_MAX_BACK:
         case WSP_GGML_OP_ROPE:
         case WSP_GGML_OP_ROPE_BACK:
@@ -16533,31 +16019,15 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             {
                 n_tasks = 1; //TODO
             } break;
-        case WSP_GGML_OP_CONV_1D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_0:
-            {
-                n_tasks = n_threads;
-            } break;
-        case WSP_GGML_OP_CONV_1D_STAGE_1:
+        case WSP_GGML_OP_SOFT_MAX:
             {
-                n_tasks = n_threads;
+                n_tasks = MIN(MIN(4, n_threads), wsp_ggml_nrows(node->src[0]));
             } break;
         case WSP_GGML_OP_CONV_TRANSPOSE_1D:
             {
                 n_tasks = n_threads;
             } break;
-        case WSP_GGML_OP_CONV_2D:
-            {
-                n_tasks = n_threads;
-            } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_0:
-            {
-                n_tasks = n_threads;
-            } break;
-        case WSP_GGML_OP_CONV_2D_STAGE_1:
+        case WSP_GGML_OP_IM2COL:
             {
                 n_tasks = n_threads;
             } break;
@@ -16574,6 +16044,14 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             {
                 n_tasks = n_threads;
             } break;
+        case WSP_GGML_OP_PAD:
+            {
+                n_tasks = n_threads;
+            } break;
+        case WSP_GGML_OP_ARGSORT:
+            {
+                n_tasks = n_threads;
+            } break;
         case WSP_GGML_OP_FLASH_ATTN:
             {
                 n_tasks = n_threads;
@@ -16642,6 +16120,12 @@ static int wsp_ggml_get_n_tasks(struct wsp_ggml_tensor * node, int n_threads) {
             } break;
         default:
             {
+                fprintf(stderr, "%s: op not implemented: ", __func__);
+                if (node->op < WSP_GGML_OP_COUNT) {
+                    fprintf(stderr, "%s\n", wsp_ggml_op_name(node->op));
+                } else {
+                    fprintf(stderr, "%d\n", node->op);
+                }
                 WSP_GGML_ASSERT(false);
             } break;
     }
@@ -16782,18 +16266,16 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
     // thread scheduling for the different operations + work buffer size estimation
     for (int i = 0; i < cgraph->n_nodes; i++) {
-        int n_tasks = 1;
         struct wsp_ggml_tensor * node = cgraph->nodes[i];
+        const int n_tasks = wsp_ggml_get_n_tasks(node, n_threads);
         size_t cur = 0;
         switch (node->op) {
             case WSP_GGML_OP_CPY:
             case WSP_GGML_OP_DUP:
                 {
-                    n_tasks = n_threads;
                     if (wsp_ggml_is_quantized(node->type)) {
                         cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
                     }
@@ -16801,16 +16283,12 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
             case WSP_GGML_OP_ADD:
             case WSP_GGML_OP_ADD1:
                 {
-                    n_tasks = n_threads;
                     if (wsp_ggml_is_quantized(node->src[0]->type)) {
                         cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
             case WSP_GGML_OP_ACC:
                 {
-                    n_tasks = n_threads;
                     if (wsp_ggml_is_quantized(node->src[0]->type)) {
                         cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[1]->ne[0] * n_tasks;
                     }
@@ -16836,45 +16314,32 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
                         cur = wsp_ggml_type_size(vec_dot_type)*wsp_ggml_nelements(node->src[1])/wsp_ggml_blck_size(vec_dot_type);
                     }
                 } break;
+            case WSP_GGML_OP_MUL_MAT_ID:
+                {
+                    const struct wsp_ggml_tensor * a = node->src[2];
+                    const struct wsp_ggml_tensor * b = node->src[1];
+                    const enum wsp_ggml_type vec_dot_type = type_traits[a->type].vec_dot_type;
+#if defined(WSP_GGML_USE_ACCELERATE) || defined(WSP_GGML_USE_OPENBLAS)
+                    if (wsp_ggml_compute_forward_mul_mat_use_blas(a, b, node)) {
+                        if (a->type != WSP_GGML_TYPE_F32) {
+                            // here we need memory just for single 2D matrix from src0
+                            cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32)*(a->ne[0]*a->ne[1]);
+                        }
+                    } else
+#endif
+                    if (b->type != vec_dot_type) {
+                        cur = wsp_ggml_type_size(vec_dot_type)*wsp_ggml_nelements(b)/wsp_ggml_blck_size(vec_dot_type);
+                    }
+                } break;
             case WSP_GGML_OP_OUT_PROD:
                 {
-                    n_tasks = n_threads;
                     if (wsp_ggml_is_quantized(node->src[0]->type)) {
                         cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->src[0]->ne[0] * n_tasks;
                     }
                 } break;
-            case WSP_GGML_OP_CONV_1D:
+            case WSP_GGML_OP_SOFT_MAX:
                 {
-                    WSP_GGML_ASSERT(node->src[0]->ne[3] == 1);
-                    WSP_GGML_ASSERT(node->src[1]->ne[2] == 1);
-                    WSP_GGML_ASSERT(node->src[1]->ne[3] == 1);
-                    const int64_t ne00 = node->src[0]->ne[0];
-                    const int64_t ne01 = node->src[0]->ne[1];
-                    const int64_t ne02 = node->src[0]->ne[2];
-                    const int64_t ne10 = node->src[1]->ne[0];
-                    const int64_t ne11 = node->src[1]->ne[1];
-                    const int64_t ne0 = node->ne[0];
-                    const int64_t ne1 = node->ne[1];
-                    const int64_t nk  = ne00;
-                    const int64_t ew0 = nk * ne01;
-                    UNUSED(ne02);
-                    UNUSED(ne10);
-                    UNUSED(ne11);
-                    if (node->src[0]->type == WSP_GGML_TYPE_F16 &&
-                        node->src[1]->type == WSP_GGML_TYPE_F32) {
-                        cur = sizeof(wsp_ggml_fp16_t)*(ne0*ne1*ew0);
-                    } else if (node->src[0]->type == WSP_GGML_TYPE_F32 &&
-                               node->src[1]->type == WSP_GGML_TYPE_F32) {
-                        cur = sizeof(float)*(ne0*ne1*ew0);
-                    } else {
-                        WSP_GGML_ASSERT(false);
-                    }
+                    cur = wsp_ggml_type_size(WSP_GGML_TYPE_F32) * node->ne[0] * n_tasks;
                 } break;
             case WSP_GGML_OP_CONV_TRANSPOSE_1D:
                 {
@@ -16901,38 +16366,6 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
                         WSP_GGML_ASSERT(false);
                     }
                 } break;
-            case WSP_GGML_OP_CONV_2D:
-                {
-                    const int64_t ne00 = node->src[0]->ne[0]; // W
-                    const int64_t ne01 = node->src[0]->ne[1]; // H
-                    const int64_t ne02 = node->src[0]->ne[2]; // C
-                    const int64_t ne03 = node->src[0]->ne[3]; // N
-                    const int64_t ne10 = node->src[1]->ne[0]; // W
-                    const int64_t ne11 = node->src[1]->ne[1]; // H
-                    const int64_t ne12 = node->src[1]->ne[2]; // C
-                    const int64_t ne0 = node->ne[0];
-                    const int64_t ne1 = node->ne[1];
-                    const int64_t ne2 = node->ne[2];
-                    const int64_t ne3 = node->ne[3];
-                    const int64_t nk = ne00*ne01;
-                    const int64_t ew0 = nk * ne02;
-                    UNUSED(ne03);
-                    UNUSED(ne2);
-                    if (node->src[0]->type == WSP_GGML_TYPE_F16 &&
-                        node->src[1]->type == WSP_GGML_TYPE_F32) {
-                        // im2col: [N*OH*OW, IC*KH*KW]
-                        cur = sizeof(wsp_ggml_fp16_t)*(ne3*ne0*ne1*ew0);
-                    } else if (node->src[0]->type == WSP_GGML_TYPE_F32 &&
-                               node->src[1]->type == WSP_GGML_TYPE_F32) {
-                        cur = sizeof(float)*      (ne10*ne11*ne12);
-                    } else {
-                        WSP_GGML_ASSERT(false);
-                    }
-                } break;
             case WSP_GGML_OP_CONV_TRANSPOSE_2D:
                 {
                     const int64_t ne00 = node->src[0]->ne[0]; // W
@@ -16949,8 +16382,6 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
                 } break;
             case WSP_GGML_OP_FLASH_ATTN:
                 {
-                    n_tasks = n_threads;
                     const int64_t ne11 = wsp_ggml_up(node->src[1]->ne[1], WSP_GGML_SOFT_MAX_UNROLL);
                     if (node->src[1]->type == WSP_GGML_TYPE_F32) {
@@ -16963,8 +16394,6 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
                 } break;
             case WSP_GGML_OP_FLASH_FF:
                 {
-                    n_tasks = n_threads;
                     if (node->src[1]->type == WSP_GGML_TYPE_F32) {
                         cur  = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1)
                         cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2
@@ -16975,8 +16404,6 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
                 } break;
             case WSP_GGML_OP_FLASH_ATTN_BACK:
                 {
-                    n_tasks = n_threads;
                     const int64_t    D = node->src[0]->ne[0];
                     const int64_t ne11 = wsp_ggml_up(node->src[1]->ne[1], WSP_GGML_SOFT_MAX_UNROLL);
                     const int64_t mxDn = MAX(D, ne11) * 2; // *2 because of S and SM in wsp_ggml_compute_forward_flash_attn_back
@@ -16991,8 +16418,6 @@ struct wsp_ggml_cplan wsp_ggml_graph_plan(struct wsp_ggml_cgraph * cgraph, int n
             case WSP_GGML_OP_CROSS_ENTROPY_LOSS:
                 {
-                    n_tasks = n_threads;
                     cur = wsp_ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks);
                 } break;
             case WSP_GGML_OP_COUNT:
@@ -18719,14 +18144,14 @@ enum wsp_ggml_opt_result wsp_ggml_opt_resume_g(
 ////////////////////////////////////////////////////////////////////////////////
-size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_0 == 0);
     const int nb = k / QK4_0;
     for (int b = 0; b < n; b += k) {
         block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
-        quantize_row_q4_0_reference(src + b, y, k);
+        wsp_quantize_row_q4_0_reference(src + b, y, k);
         for (int i = 0; i < nb; i++) {
             for (int j = 0; j < QK4_0; j += 2) {
@@ -18742,14 +18167,14 @@ size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64
     return (n/QK4_0*sizeof(block_q4_0));
 }
-size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK4_1 == 0);
     const int nb = k / QK4_1;
     for (int b = 0; b < n; b += k) {
         block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
-        quantize_row_q4_1_reference(src + b, y, k);
+        wsp_quantize_row_q4_1_reference(src + b, y, k);
         for (int i = 0; i < nb; i++) {
             for (int j = 0; j < QK4_1; j += 2) {
@@ -18765,22 +18190,22 @@ size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64
     return (n/QK4_1*sizeof(block_q4_1));
 }
-size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK5_0 == 0);
     const int nb = k / QK5_0;
     for (int b = 0; b < n; b += k) {
         block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
-        quantize_row_q5_0_reference(src + b, y, k);
+        wsp_quantize_row_q5_0_reference(src + b, y, k);
         for (int i = 0; i < nb; i++) {
             uint32_t qh;
             memcpy(&qh, &y[i].qh, sizeof(qh));
             for (int j = 0; j < QK5_0; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -18795,22 +18220,22 @@ size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64
     return (n/QK5_0*sizeof(block_q5_0));
 }
-size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK5_1 == 0);
     const int nb = k / QK5_1;
     for (int b = 0; b < n; b += k) {
         block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
-        quantize_row_q5_1_reference(src + b, y, k);
+        wsp_quantize_row_q5_1_reference(src + b, y, k);
         for (int i = 0; i < nb; i++) {
             uint32_t qh;
             memcpy(&qh, &y[i].qh, sizeof(qh));
             for (int j = 0; j < QK5_1; j += 2) {
-                const uint8_t vh0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-                const uint8_t vh1 = ((qh & (1u << (j + 16))) >> (j + 12));
+                const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
+                const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
                 // cast to 16 bins
                 const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
@@ -18825,14 +18250,14 @@ size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64
     return (n/QK5_1*sizeof(block_q5_1));
 }
-size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
     for (int b = 0; b < n; b += k) {
         block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
-        quantize_row_q8_0_reference(src + b, y, k);
+        wsp_quantize_row_q8_0_reference(src + b, y, k);
         for (int i = 0; i < nb; i++) {
             for (int j = 0; j < QK8_0; ++j) {
@@ -18846,68 +18271,68 @@ size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64
     return (n/QK8_0*sizeof(block_q8_0));
 }
-size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
+size_t wsp_ggml_wsp_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist) {
     size_t result = 0;
     switch (type) {
         case WSP_GGML_TYPE_Q4_0:
             {
                 WSP_GGML_ASSERT(start % QK4_0 == 0);
                 block_q4_0 * block = (block_q4_0*)dst + start / QK4_0;
-                result = wsp_ggml_quantize_q4_0(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q4_0(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q4_1:
             {
                 WSP_GGML_ASSERT(start % QK4_1 == 0);
                 block_q4_1 * block = (block_q4_1*)dst + start / QK4_1;
-                result = wsp_ggml_quantize_q4_1(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q4_1(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q5_0:
             {
                 WSP_GGML_ASSERT(start % QK5_0 == 0);
                 block_q5_0 * block = (block_q5_0*)dst + start / QK5_0;
-                result = wsp_ggml_quantize_q5_0(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q5_0(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q5_1:
             {
                 WSP_GGML_ASSERT(start % QK5_1 == 0);
                 block_q5_1 * block = (block_q5_1*)dst + start / QK5_1;
-                result = wsp_ggml_quantize_q5_1(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q5_1(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q8_0:
             {
                 WSP_GGML_ASSERT(start % QK8_0 == 0);
                 block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
-                result = wsp_ggml_quantize_q8_0(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q8_0(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q2_K:
             {
                 WSP_GGML_ASSERT(start % QK_K == 0);
                 block_q2_K * block = (block_q2_K*)dst + start / QK_K;
-                result = wsp_ggml_quantize_q2_K(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q2_K(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q3_K:
             {
                 WSP_GGML_ASSERT(start % QK_K == 0);
                 block_q3_K * block = (block_q3_K*)dst + start / QK_K;
-                result = wsp_ggml_quantize_q3_K(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q3_K(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q4_K:
             {
                 WSP_GGML_ASSERT(start % QK_K == 0);
                 block_q4_K * block = (block_q4_K*)dst + start / QK_K;
-                result = wsp_ggml_quantize_q4_K(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q4_K(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q5_K:
             {
                 WSP_GGML_ASSERT(start % QK_K == 0);
                 block_q5_K * block = (block_q5_K*)dst + start / QK_K;
-                result = wsp_ggml_quantize_q5_K(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q5_K(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_Q6_K:
             {
                 WSP_GGML_ASSERT(start % QK_K == 0);
                 block_q6_K * block = (block_q6_K*)dst + start / QK_K;
-                result = wsp_ggml_quantize_q6_K(src + start, block, n, n, hist);
+                result = wsp_ggml_wsp_quantize_q6_K(src + start, block, n, n, hist);
             } break;
         case WSP_GGML_TYPE_F16:
             {
@@ -19000,6 +18425,7 @@ struct wsp_gguf_kv {
 struct wsp_gguf_header {
     char magic[4];
     uint32_t version;
     uint64_t n_tensors; // GGUFv2
     uint64_t n_kv;      // GGUFv2
@@ -19089,7 +18515,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
         for (uint32_t i = 0; i < sizeof(magic); i++) {
             if (magic[i] != WSP_GGUF_MAGIC[i]) {
-                fprintf(stderr, "%s: invalid magic characters %s.\n", __func__, magic);
+                fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]);
                 fclose(file);
                 return NULL;
             }
@@ -19104,7 +18530,6 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
     {
         strncpy(ctx->header.magic, magic, 4);
         ctx->kv    = NULL;
         ctx->infos = NULL;
         ctx->data  = NULL;
@@ -19132,7 +18557,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
     {
         ctx->kv = malloc(ctx->header.n_kv * sizeof(struct wsp_gguf_kv));
-        for (uint32_t i = 0; i < ctx->header.n_kv; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_kv; ++i) {
             struct wsp_gguf_kv * kv = &ctx->kv[i];
             //fprintf(stderr, "%s: reading kv %d\n", __func__, i);
@@ -19179,7 +18604,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
                             case WSP_GGUF_TYPE_STRING:
                                 {
                                     kv->value.arr.data = malloc(kv->value.arr.n * sizeof(struct wsp_gguf_str));
-                                    for (uint32_t j = 0; j < kv->value.arr.n; ++j) {
+                                    for (uint64_t j = 0; j < kv->value.arr.n; ++j) {
                                         ok = ok && wsp_gguf_fread_str(file, &((struct wsp_gguf_str *) kv->value.arr.data)[j], &offset);
                                     }
                                 } break;
@@ -19207,7 +18632,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
     {
         ctx->infos = malloc(ctx->header.n_tensors * sizeof(struct wsp_gguf_tensor_info));
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct wsp_gguf_tensor_info * info = &ctx->infos[i];
             for (int j = 0; j < WSP_GGML_MAX_DIMS; ++j) {
@@ -19254,7 +18679,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
     // compute the total size of the data section, taking into account the alignment
     {
         ctx->size = 0;
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             struct wsp_gguf_tensor_info * info = &ctx->infos[i];
             const int64_t ne =
@@ -19323,7 +18748,7 @@ struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp
         wsp_ggml_set_no_alloc(ctx_data, true);
         // create the tensors
-        for (uint32_t i = 0; i < ctx->header.n_tensors; ++i) {
+        for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) {
             const int64_t ne[WSP_GGML_MAX_DIMS] = {
                 ctx->infos[i].ne[0],
                 ctx->infos[i].ne[1],
@@ -19458,24 +18883,29 @@ int wsp_gguf_find_key(const struct wsp_gguf_context * ctx, const char * key) {
 }
 const char * wsp_gguf_get_key(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     return ctx->kv[key_id].key.data;
 }
 enum wsp_gguf_type wsp_gguf_get_kv_type(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     return ctx->kv[key_id].type;
 }
 enum wsp_gguf_type wsp_gguf_get_arr_type(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.type;
 }
 const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.data;
 }
 const char * wsp_gguf_get_arr_str(const struct wsp_gguf_context * ctx, int key_id, int i) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
     struct wsp_gguf_kv * kv = &ctx->kv[key_id];
     struct wsp_gguf_str * str = &((struct wsp_gguf_str *) kv->value.arr.data)[i];
@@ -19483,70 +18913,90 @@ const char * wsp_gguf_get_arr_str(const struct wsp_gguf_context * ctx, int key_i
 }
 int wsp_gguf_get_arr_n(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_ARRAY);
     return ctx->kv[key_id].value.arr.n;
 }
 uint8_t wsp_gguf_get_val_u8(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT8);
     return ctx->kv[key_id].value.uint8;
 }
 int8_t wsp_gguf_get_val_i8(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT8);
     return ctx->kv[key_id].value.int8;
 }
 uint16_t wsp_gguf_get_val_u16(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT16);
     return ctx->kv[key_id].value.uint16;
 }
 int16_t wsp_gguf_get_val_i16(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT16);
     return ctx->kv[key_id].value.int16;
 }
 uint32_t wsp_gguf_get_val_u32(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT32);
     return ctx->kv[key_id].value.uint32;
 }
 int32_t wsp_gguf_get_val_i32(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT32);
     return ctx->kv[key_id].value.int32;
 }
 float wsp_gguf_get_val_f32(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_FLOAT32);
     return ctx->kv[key_id].value.float32;
 }
 uint64_t wsp_gguf_get_val_u64(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_UINT64);
     return ctx->kv[key_id].value.uint64;
 }
 int64_t wsp_gguf_get_val_i64(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_INT64);
     return ctx->kv[key_id].value.int64;
 }
 double wsp_gguf_get_val_f64(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_FLOAT64);
     return ctx->kv[key_id].value.float64;
 }
 bool wsp_gguf_get_val_bool(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_BOOL);
     return ctx->kv[key_id].value.bool_;
 }
 const char * wsp_gguf_get_val_str(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
     WSP_GGML_ASSERT(ctx->kv[key_id].type == WSP_GGUF_TYPE_STRING);
     return ctx->kv[key_id].value.str.data;
 }
+const void * wsp_gguf_get_val_data(const struct wsp_gguf_context * ctx, int key_id) {
+    WSP_GGML_ASSERT(key_id >= 0 && key_id < wsp_gguf_get_n_kv(ctx));
+    WSP_GGML_ASSERT(ctx->kv[key_id].type != WSP_GGUF_TYPE_ARRAY);
+    WSP_GGML_ASSERT(ctx->kv[key_id].type != WSP_GGUF_TYPE_STRING);
+    return &ctx->kv[key_id].value;
+}
 int wsp_gguf_get_n_tensors(const struct wsp_gguf_context * ctx) {
     return ctx->header.n_tensors;
 }