whisper.rn 0.4.0-rc.8 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (201)
  1. package/README.md +5 -1
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +44 -13
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -12
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +75 -34
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -38
  7. package/android/src/main/jni.cpp +38 -1
  8. package/android/src/main/jniLibs/arm64-v8a/librnwhisper.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnwhisper_v8fp16_va_2.so +0 -0
  10. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper.so +0 -0
  11. package/android/src/main/jniLibs/armeabi-v7a/librnwhisper_vfpv4.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnwhisper.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnwhisper_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  15. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  16. package/cpp/coreml/whisper-compat.h +10 -0
  17. package/cpp/coreml/whisper-compat.m +35 -0
  18. package/cpp/coreml/whisper-decoder-impl.h +27 -15
  19. package/cpp/coreml/whisper-decoder-impl.m +36 -10
  20. package/cpp/coreml/whisper-encoder-impl.h +21 -9
  21. package/cpp/coreml/whisper-encoder-impl.m +29 -3
  22. package/cpp/ggml-alloc.c +727 -517
  23. package/cpp/ggml-alloc.h +47 -65
  24. package/cpp/ggml-backend-impl.h +196 -57
  25. package/cpp/ggml-backend-reg.cpp +591 -0
  26. package/cpp/ggml-backend.cpp +2016 -0
  27. package/cpp/ggml-backend.h +234 -89
  28. package/cpp/ggml-common.h +1861 -0
  29. package/cpp/ggml-cpp.h +39 -0
  30. package/cpp/ggml-cpu/amx/amx.cpp +221 -0
  31. package/cpp/ggml-cpu/amx/amx.h +8 -0
  32. package/cpp/ggml-cpu/amx/common.h +91 -0
  33. package/cpp/ggml-cpu/amx/mmq.cpp +2511 -0
  34. package/cpp/ggml-cpu/amx/mmq.h +10 -0
  35. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  36. package/cpp/ggml-cpu/arch/arm/quants.c +4113 -0
  37. package/cpp/ggml-cpu/arch/arm/repack.cpp +2162 -0
  38. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  39. package/cpp/ggml-cpu/arch/x86/quants.c +4310 -0
  40. package/cpp/ggml-cpu/arch/x86/repack.cpp +3284 -0
  41. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  42. package/cpp/ggml-cpu/binary-ops.cpp +158 -0
  43. package/cpp/ggml-cpu/binary-ops.h +16 -0
  44. package/cpp/ggml-cpu/common.h +72 -0
  45. package/cpp/ggml-cpu/ggml-cpu-impl.h +511 -0
  46. package/cpp/ggml-cpu/ggml-cpu.c +3473 -0
  47. package/cpp/ggml-cpu/ggml-cpu.cpp +671 -0
  48. package/cpp/ggml-cpu/ops.cpp +9085 -0
  49. package/cpp/ggml-cpu/ops.h +111 -0
  50. package/cpp/ggml-cpu/quants.c +1157 -0
  51. package/cpp/ggml-cpu/quants.h +89 -0
  52. package/cpp/ggml-cpu/repack.cpp +1570 -0
  53. package/cpp/ggml-cpu/repack.h +98 -0
  54. package/cpp/ggml-cpu/simd-mappings.h +1006 -0
  55. package/cpp/ggml-cpu/traits.cpp +36 -0
  56. package/cpp/ggml-cpu/traits.h +38 -0
  57. package/cpp/ggml-cpu/unary-ops.cpp +186 -0
  58. package/cpp/ggml-cpu/unary-ops.h +28 -0
  59. package/cpp/ggml-cpu/vec.cpp +321 -0
  60. package/cpp/ggml-cpu/vec.h +973 -0
  61. package/cpp/ggml-cpu.h +143 -0
  62. package/cpp/ggml-impl.h +525 -168
  63. package/cpp/ggml-metal-impl.h +622 -0
  64. package/cpp/ggml-metal.h +16 -14
  65. package/cpp/ggml-metal.m +5289 -1859
  66. package/cpp/ggml-opt.cpp +1037 -0
  67. package/cpp/ggml-opt.h +237 -0
  68. package/cpp/ggml-quants.c +2916 -6877
  69. package/cpp/ggml-quants.h +87 -249
  70. package/cpp/ggml-threading.cpp +12 -0
  71. package/cpp/ggml-threading.h +14 -0
  72. package/cpp/ggml-whisper-sim.metallib +0 -0
  73. package/cpp/ggml-whisper.metallib +0 -0
  74. package/cpp/ggml.c +3293 -16770
  75. package/cpp/ggml.h +778 -835
  76. package/cpp/gguf.cpp +1347 -0
  77. package/cpp/gguf.h +202 -0
  78. package/cpp/rn-whisper.cpp +84 -0
  79. package/cpp/rn-whisper.h +2 -0
  80. package/cpp/whisper-arch.h +197 -0
  81. package/cpp/whisper.cpp +3240 -944
  82. package/cpp/whisper.h +144 -31
  83. package/ios/CMakeLists.txt +95 -0
  84. package/ios/RNWhisper.h +5 -0
  85. package/ios/RNWhisper.mm +124 -37
  86. package/ios/RNWhisperAudioUtils.h +1 -0
  87. package/ios/RNWhisperAudioUtils.m +24 -13
  88. package/ios/RNWhisperContext.h +8 -2
  89. package/ios/RNWhisperContext.mm +42 -8
  90. package/ios/rnwhisper.xcframework/Info.plist +74 -0
  91. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  92. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  93. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  94. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  95. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  96. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  97. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  98. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  99. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  100. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  101. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  102. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  103. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  104. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  105. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  106. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  107. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  108. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  109. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  110. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/Info.plist +0 -0
  111. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  112. package/ios/rnwhisper.xcframework/ios-arm64/rnwhisper.framework/rnwhisper +0 -0
  113. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  114. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  115. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  116. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  117. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  118. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  119. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  120. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  121. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  122. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  123. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  124. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  125. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  126. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  127. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  128. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  129. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  130. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  131. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  132. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  133. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  134. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  135. package/ios/rnwhisper.xcframework/ios-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  136. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  137. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  138. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  139. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  140. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  141. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  142. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  143. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  144. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  145. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  146. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  147. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  148. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/ggml.h +2221 -0
  149. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/gguf.h +202 -0
  150. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  151. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  152. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  153. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  154. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Headers/whisper.h +739 -0
  155. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/Info.plist +0 -0
  156. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/ggml-whisper.metallib +0 -0
  157. package/ios/rnwhisper.xcframework/tvos-arm64/rnwhisper.framework/rnwhisper +0 -0
  158. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-alloc.h +76 -0
  159. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend-impl.h +255 -0
  160. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-backend.h +354 -0
  161. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-common.h +1861 -0
  162. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpp.h +39 -0
  163. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-cpu.h +143 -0
  164. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-impl.h +603 -0
  165. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal-impl.h +622 -0
  166. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-metal.h +66 -0
  167. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-opt.h +237 -0
  168. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-quants.h +100 -0
  169. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml-threading.h +14 -0
  170. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/ggml.h +2221 -0
  171. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/gguf.h +202 -0
  172. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-audioutils.h +14 -0
  173. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper-log.h +11 -0
  174. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/rn-whisper.h +52 -0
  175. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper-arch.h +197 -0
  176. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Headers/whisper.h +739 -0
  177. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/Info.plist +0 -0
  178. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/_CodeSignature/CodeResources +101 -0
  179. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/ggml-whisper-sim.metallib +0 -0
  180. package/ios/rnwhisper.xcframework/tvos-arm64_x86_64-simulator/rnwhisper.framework/rnwhisper +0 -0
  181. package/jest/mock.js +14 -1
  182. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  183. package/lib/commonjs/index.js +48 -19
  184. package/lib/commonjs/index.js.map +1 -1
  185. package/lib/commonjs/version.json +1 -1
  186. package/lib/module/NativeRNWhisper.js.map +1 -1
  187. package/lib/module/index.js +48 -19
  188. package/lib/module/index.js.map +1 -1
  189. package/lib/module/version.json +1 -1
  190. package/lib/typescript/NativeRNWhisper.d.ts +6 -3
  191. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  192. package/lib/typescript/index.d.ts +25 -3
  193. package/lib/typescript/index.d.ts.map +1 -1
  194. package/package.json +15 -10
  195. package/src/NativeRNWhisper.ts +12 -3
  196. package/src/index.ts +63 -24
  197. package/src/version.json +1 -1
  198. package/whisper-rn.podspec +18 -18
  199. package/cpp/README.md +0 -4
  200. package/cpp/ggml-backend.c +0 -1718
  201. package/cpp/ggml-metal-whisper.metal +0 -5820
package/cpp/ggml.h CHANGED
@@ -176,25 +176,15 @@
  #ifdef WSP_GGML_SHARED
  # if defined(_WIN32) && !defined(__MINGW32__)
  # ifdef WSP_GGML_BUILD
- # define WSP_GGML_API __declspec(dllexport)
+ # define WSP_GGML_API __declspec(dllexport) extern
  # else
- # define WSP_GGML_API __declspec(dllimport)
+ # define WSP_GGML_API __declspec(dllimport) extern
  # endif
  # else
- # define WSP_GGML_API __attribute__ ((visibility ("default")))
+ # define WSP_GGML_API __attribute__ ((visibility ("default"))) extern
  # endif
  #else
- # define WSP_GGML_API
- #endif
-
- #ifdef WSP_GGML_MULTIPLATFORM
- # if defined(_WIN32)
- # define WSP_GGML_CALL
- # else
- # define WSP_GGML_CALL __attribute__((__ms_abi__))
- # endif
- #else
- # define WSP_GGML_CALL
+ # define WSP_GGML_API extern
  #endif

  // TODO: support for clang
@@ -208,32 +198,36 @@

  #ifndef __GNUC__
  # define WSP_GGML_ATTRIBUTE_FORMAT(...)
- #elif defined(__MINGW32__)
+ #elif defined(__MINGW32__) && !defined(__clang__)
  # define WSP_GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
  #else
  # define WSP_GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
  #endif

- #include <stdint.h>
- #include <stddef.h>
  #include <stdbool.h>
+ #include <stddef.h>
+ #include <stdint.h>
+ #include <stdio.h>

  #define WSP_GGML_FILE_MAGIC 0x67676d6c // "ggml"
- #define WSP_GGML_FILE_VERSION 1
+ #define WSP_GGML_FILE_VERSION 2

  #define WSP_GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define WSP_GGML_QNT_VERSION_FACTOR 1000 // do not change this

  #define WSP_GGML_MAX_DIMS 4
  #define WSP_GGML_MAX_PARAMS 2048
- #define WSP_GGML_MAX_CONTEXTS 64
  #define WSP_GGML_MAX_SRC 10
+ #define WSP_GGML_MAX_N_THREADS 512
+ #define WSP_GGML_MAX_OP_PARAMS 64
+
  #ifndef WSP_GGML_MAX_NAME
- #define WSP_GGML_MAX_NAME 64
+ # define WSP_GGML_MAX_NAME 64
  #endif
- #define WSP_GGML_MAX_OP_PARAMS 64
+
  #define WSP_GGML_DEFAULT_N_THREADS 4
  #define WSP_GGML_DEFAULT_GRAPH_SIZE 2048
+
  #if UINTPTR_MAX == 0xFFFFFFFF
  #define WSP_GGML_MEM_ALIGN 4
  #else
@@ -243,36 +237,35 @@
  #define WSP_GGML_EXIT_SUCCESS 0
  #define WSP_GGML_EXIT_ABORTED 1

- #define WSP_GGUF_MAGIC "GGUF"
-
- #define WSP_GGUF_VERSION 3
-
- #define WSP_GGUF_DEFAULT_ALIGNMENT 32
+ #define WSP_GGML_ROPE_TYPE_NEOX 2
+ #define WSP_GGML_ROPE_TYPE_MROPE 8
+ #define WSP_GGML_ROPE_TYPE_VISION 24

  #define WSP_GGML_UNUSED(x) (void)(x)

  #define WSP_GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))

- #define WSP_GGML_ASSERT(x) \
- do { \
- if (!(x)) { \
- fflush(stdout); \
- fprintf(stderr, "WSP_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- wsp_ggml_print_backtrace(); \
- abort(); \
- } \
- } while (0)
-
  #ifndef NDEBUG
- #define WSP_GGML_UNREACHABLE() WSP_GGML_ASSERT(!"statement should not be reached")
+ # define WSP_GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
  #elif defined(__GNUC__)
- #define WSP_GGML_UNREACHABLE() __builtin_unreachable()
+ # define WSP_GGML_UNREACHABLE() __builtin_unreachable()
+ #elif defined(_MSC_VER)
+ # define WSP_GGML_UNREACHABLE() __assume(0)
+ #else
+ # define WSP_GGML_UNREACHABLE() ((void) 0)
+ #endif
+
+ #ifdef __cplusplus
+ # define WSP_GGML_NORETURN [[noreturn]]
  #elif defined(_MSC_VER)
- #define WSP_GGML_UNREACHABLE() __assume(0)
+ # define WSP_GGML_NORETURN __declspec(noreturn)
  #else
- #define WSP_GGML_UNREACHABLE() ((void) 0)
+ # define WSP_GGML_NORETURN _Noreturn
  #endif

+ #define WSP_GGML_ABORT(...) wsp_ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+ #define WSP_GGML_ASSERT(x) if (!(x)) WSP_GGML_ABORT("WSP_GGML_ASSERT(%s) failed", #x)
+
  // used to copy the number of elements and stride in bytes of tensors into local variables.
  // main purpose is to reduce code duplication and improve readability.
  //
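
Note: WSP_GGML_ASSERT no longer expands to an inline fprintf/abort block; failures now route through the new wsp_ggml_abort(), which records file and line. A minimal caller-side sketch (hypothetical helper name; only the macros declared above are assumed):

    #include "ggml.h"   // whisper.rn's vendored, wsp_-prefixed ggml header

    static void must_be_positive(int64_t n) {
        WSP_GGML_ASSERT(n >= 0);   // aborts via wsp_ggml_abort() on failure
        if (n == 0) {
            // printf-style message; wsp_ggml_abort() does not return
            WSP_GGML_ABORT("n must be > 0, got %lld", (long long) n);
        }
    }
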
@@ -311,84 +304,125 @@
  WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
  WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb)

+ #define WSP_GGML_TENSOR_BINARY_OP_LOCALS01 \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
+
  #ifdef __cplusplus
  extern "C" {
  #endif

- #if defined(__ARM_NEON) && defined(__CUDACC__)
- typedef half wsp_ggml_fp16_t;
- #elif defined(__ARM_NEON) && !defined(_MSC_VER)
- typedef __fp16 wsp_ggml_fp16_t;
- #else
- typedef uint16_t wsp_ggml_fp16_t;
- #endif
+ WSP_GGML_NORETURN WSP_GGML_ATTRIBUTE_FORMAT(3, 4)
+ WSP_GGML_API void wsp_ggml_abort(const char * file, int line, const char * fmt, ...);
+
+ enum wsp_ggml_status {
+ WSP_GGML_STATUS_ALLOC_FAILED = -2,
+ WSP_GGML_STATUS_FAILED = -1,
+ WSP_GGML_STATUS_SUCCESS = 0,
+ WSP_GGML_STATUS_ABORTED = 1,
+ };

- // convert FP16 <-> FP32
- WSP_GGML_API float wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t x);
- WSP_GGML_API wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float x);
+ // get wsp_ggml_status name string
+ WSP_GGML_API const char * wsp_ggml_status_to_string(enum wsp_ggml_status status);

- WSP_GGML_API void wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t * x, float * y, int n);
- WSP_GGML_API void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, int n);
+ // ieee 754-2008 half-precision float16
+ // todo: make this not an integral type
+ typedef uint16_t wsp_ggml_fp16_t;
+ WSP_GGML_API float wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t);
+ WSP_GGML_API wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float);
+ WSP_GGML_API void wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t *, float *, int64_t);
+ WSP_GGML_API void wsp_ggml_fp32_to_fp16_row(const float *, wsp_ggml_fp16_t *, int64_t);
+
+ // google brain half-precision bfloat16
+ typedef struct { uint16_t bits; } wsp_ggml_bf16_t;
+ WSP_GGML_API wsp_ggml_bf16_t wsp_ggml_fp32_to_bf16(float);
+ WSP_GGML_API float wsp_ggml_bf16_to_fp32(wsp_ggml_bf16_t); // consider just doing << 16
+ WSP_GGML_API void wsp_ggml_bf16_to_fp32_row(const wsp_ggml_bf16_t *, float *, int64_t);
+ WSP_GGML_API void wsp_ggml_fp32_to_bf16_row_ref(const float *, wsp_ggml_bf16_t *, int64_t);
+ WSP_GGML_API void wsp_ggml_fp32_to_bf16_row(const float *, wsp_ggml_bf16_t *, int64_t);

  struct wsp_ggml_object;
  struct wsp_ggml_context;
+ struct wsp_ggml_cgraph;

+ // NOTE: always add types at the end of the enum to keep backward compatibility
  enum wsp_ggml_type {
- WSP_GGML_TYPE_F32 = 0,
- WSP_GGML_TYPE_F16 = 1,
- WSP_GGML_TYPE_Q4_0 = 2,
- WSP_GGML_TYPE_Q4_1 = 3,
+ WSP_GGML_TYPE_F32 = 0,
+ WSP_GGML_TYPE_F16 = 1,
+ WSP_GGML_TYPE_Q4_0 = 2,
+ WSP_GGML_TYPE_Q4_1 = 3,
  // WSP_GGML_TYPE_Q4_2 = 4, support has been removed
- // WSP_GGML_TYPE_Q4_3 (5) support has been removed
- WSP_GGML_TYPE_Q5_0 = 6,
- WSP_GGML_TYPE_Q5_1 = 7,
- WSP_GGML_TYPE_Q8_0 = 8,
- WSP_GGML_TYPE_Q8_1 = 9,
- // k-quantizations
- WSP_GGML_TYPE_Q2_K = 10,
- WSP_GGML_TYPE_Q3_K = 11,
- WSP_GGML_TYPE_Q4_K = 12,
- WSP_GGML_TYPE_Q5_K = 13,
- WSP_GGML_TYPE_Q6_K = 14,
- WSP_GGML_TYPE_Q8_K = 15,
+ // WSP_GGML_TYPE_Q4_3 = 5, support has been removed
+ WSP_GGML_TYPE_Q5_0 = 6,
+ WSP_GGML_TYPE_Q5_1 = 7,
+ WSP_GGML_TYPE_Q8_0 = 8,
+ WSP_GGML_TYPE_Q8_1 = 9,
+ WSP_GGML_TYPE_Q2_K = 10,
+ WSP_GGML_TYPE_Q3_K = 11,
+ WSP_GGML_TYPE_Q4_K = 12,
+ WSP_GGML_TYPE_Q5_K = 13,
+ WSP_GGML_TYPE_Q6_K = 14,
+ WSP_GGML_TYPE_Q8_K = 15,
  WSP_GGML_TYPE_IQ2_XXS = 16,
  WSP_GGML_TYPE_IQ2_XS = 17,
- WSP_GGML_TYPE_I8,
- WSP_GGML_TYPE_I16,
- WSP_GGML_TYPE_I32,
- WSP_GGML_TYPE_COUNT,
+ WSP_GGML_TYPE_IQ3_XXS = 18,
+ WSP_GGML_TYPE_IQ1_S = 19,
+ WSP_GGML_TYPE_IQ4_NL = 20,
+ WSP_GGML_TYPE_IQ3_S = 21,
+ WSP_GGML_TYPE_IQ2_S = 22,
+ WSP_GGML_TYPE_IQ4_XS = 23,
+ WSP_GGML_TYPE_I8 = 24,
+ WSP_GGML_TYPE_I16 = 25,
+ WSP_GGML_TYPE_I32 = 26,
+ WSP_GGML_TYPE_I64 = 27,
+ WSP_GGML_TYPE_F64 = 28,
+ WSP_GGML_TYPE_IQ1_M = 29,
+ WSP_GGML_TYPE_BF16 = 30,
+ // WSP_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+ // WSP_GGML_TYPE_Q4_0_4_8 = 32,
+ // WSP_GGML_TYPE_Q4_0_8_8 = 33,
+ WSP_GGML_TYPE_TQ1_0 = 34,
+ WSP_GGML_TYPE_TQ2_0 = 35,
+ // WSP_GGML_TYPE_IQ4_NL_4_4 = 36,
+ // WSP_GGML_TYPE_IQ4_NL_4_8 = 37,
+ // WSP_GGML_TYPE_IQ4_NL_8_8 = 38,
+ WSP_GGML_TYPE_COUNT = 39,
  };

  // precision
  enum wsp_ggml_prec {
- WSP_GGML_PREC_DEFAULT,
- WSP_GGML_PREC_F32,
- };
-
- enum wsp_ggml_backend_type {
- WSP_GGML_BACKEND_CPU = 0,
- WSP_GGML_BACKEND_GPU = 10,
- WSP_GGML_BACKEND_GPU_SPLIT = 20,
+ WSP_GGML_PREC_DEFAULT = 0, // stored as wsp_ggml_tensor.op_params, 0 by default
+ WSP_GGML_PREC_F32 = 10,
  };

  // model file types
  enum wsp_ggml_ftype {
- WSP_GGML_FTYPE_UNKNOWN = -1,
- WSP_GGML_FTYPE_ALL_F32 = 0,
- WSP_GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ WSP_GGML_FTYPE_UNKNOWN = -1,
+ WSP_GGML_FTYPE_ALL_F32 = 0,
+ WSP_GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
  WSP_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
- WSP_GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
- WSP_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
  WSP_GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
  WSP_GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ1_S = 18, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ4_NL = 19, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
+ WSP_GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
  };

  // available tensor operations:
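
Note: the fp16 row converters now take int64_t element counts, and bfloat16 support is new in this release. A minimal sketch of a lossy fp32 -> fp16 -> fp32 round trip (hypothetical function, assuming only the declarations above):

    #include "ggml.h"

    void fp16_roundtrip(const float * src, float * dst, int64_t n) {
        wsp_ggml_fp16_t half[64];
        WSP_GGML_ASSERT(n <= 64);                 // keep the scratch buffer simple
        wsp_ggml_fp32_to_fp16_row(src, half, n);  // fp32 -> fp16 (lossy)
        wsp_ggml_fp16_to_fp32_row(half, dst, n);  // fp16 -> fp32
    }
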
@@ -405,10 +439,13 @@ extern "C" {
  WSP_GGML_OP_SQR,
  WSP_GGML_OP_SQRT,
  WSP_GGML_OP_LOG,
+ WSP_GGML_OP_SIN,
+ WSP_GGML_OP_COS,
  WSP_GGML_OP_SUM,
  WSP_GGML_OP_SUM_ROWS,
  WSP_GGML_OP_MEAN,
  WSP_GGML_OP_ARGMAX,
+ WSP_GGML_OP_COUNT_EQUAL,
  WSP_GGML_OP_REPEAT,
  WSP_GGML_OP_REPEAT_BACK,
  WSP_GGML_OP_CONCAT,
@@ -417,6 +454,7 @@ extern "C" {
  WSP_GGML_OP_RMS_NORM,
  WSP_GGML_OP_RMS_NORM_BACK,
  WSP_GGML_OP_GROUP_NORM,
+ WSP_GGML_OP_L2_NORM,

  WSP_GGML_OP_MUL_MAT,
  WSP_GGML_OP_MUL_MAT_ID,
@@ -439,41 +477,47 @@ extern "C" {
  WSP_GGML_OP_SOFT_MAX_BACK,
  WSP_GGML_OP_ROPE,
  WSP_GGML_OP_ROPE_BACK,
- WSP_GGML_OP_ALIBI,
  WSP_GGML_OP_CLAMP,
  WSP_GGML_OP_CONV_TRANSPOSE_1D,
  WSP_GGML_OP_IM2COL,
+ WSP_GGML_OP_IM2COL_BACK,
+ WSP_GGML_OP_CONV_2D_DW,
  WSP_GGML_OP_CONV_TRANSPOSE_2D,
  WSP_GGML_OP_POOL_1D,
  WSP_GGML_OP_POOL_2D,
+ WSP_GGML_OP_POOL_2D_BACK,
  WSP_GGML_OP_UPSCALE, // nearest interpolate
  WSP_GGML_OP_PAD,
+ WSP_GGML_OP_PAD_REFLECT_1D,
+ WSP_GGML_OP_ROLL,
+ WSP_GGML_OP_ARANGE,
+ WSP_GGML_OP_TIMESTEP_EMBEDDING,
  WSP_GGML_OP_ARGSORT,
  WSP_GGML_OP_LEAKY_RELU,

- WSP_GGML_OP_FLASH_ATTN,
- WSP_GGML_OP_FLASH_FF,
+ WSP_GGML_OP_FLASH_ATTN_EXT,
  WSP_GGML_OP_FLASH_ATTN_BACK,
+ WSP_GGML_OP_SSM_CONV,
+ WSP_GGML_OP_SSM_SCAN,
  WSP_GGML_OP_WIN_PART,
  WSP_GGML_OP_WIN_UNPART,
  WSP_GGML_OP_GET_REL_POS,
  WSP_GGML_OP_ADD_REL_POS,
+ WSP_GGML_OP_RWKV_WKV6,
+ WSP_GGML_OP_GATED_LINEAR_ATTN,
+ WSP_GGML_OP_RWKV_WKV7,

  WSP_GGML_OP_UNARY,

- WSP_GGML_OP_MAP_UNARY,
- WSP_GGML_OP_MAP_BINARY,
-
- WSP_GGML_OP_MAP_CUSTOM1_F32,
- WSP_GGML_OP_MAP_CUSTOM2_F32,
- WSP_GGML_OP_MAP_CUSTOM3_F32,
-
  WSP_GGML_OP_MAP_CUSTOM1,
  WSP_GGML_OP_MAP_CUSTOM2,
  WSP_GGML_OP_MAP_CUSTOM3,

+ WSP_GGML_OP_CUSTOM,
+
  WSP_GGML_OP_CROSS_ENTROPY_LOSS,
  WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+ WSP_GGML_OP_OPT_STEP_ADAMW,

  WSP_GGML_OP_COUNT,
  };
@@ -486,44 +530,51 @@ extern "C" {
  WSP_GGML_UNARY_OP_TANH,
  WSP_GGML_UNARY_OP_ELU,
  WSP_GGML_UNARY_OP_RELU,
+ WSP_GGML_UNARY_OP_SIGMOID,
  WSP_GGML_UNARY_OP_GELU,
  WSP_GGML_UNARY_OP_GELU_QUICK,
  WSP_GGML_UNARY_OP_SILU,
+ WSP_GGML_UNARY_OP_HARDSWISH,
+ WSP_GGML_UNARY_OP_HARDSIGMOID,
+ WSP_GGML_UNARY_OP_EXP,
+ WSP_GGML_UNARY_OP_GELU_ERF,

  WSP_GGML_UNARY_OP_COUNT,
  };

  enum wsp_ggml_object_type {
- WSP_GGML_OBJECT_TENSOR,
- WSP_GGML_OBJECT_GRAPH,
- WSP_GGML_OBJECT_WORK_BUFFER
+ WSP_GGML_OBJECT_TYPE_TENSOR,
+ WSP_GGML_OBJECT_TYPE_GRAPH,
+ WSP_GGML_OBJECT_TYPE_WORK_BUFFER
  };

  enum wsp_ggml_log_level {
- WSP_GGML_LOG_LEVEL_ERROR = 2,
- WSP_GGML_LOG_LEVEL_WARN = 3,
- WSP_GGML_LOG_LEVEL_INFO = 4,
- WSP_GGML_LOG_LEVEL_DEBUG = 5
+ WSP_GGML_LOG_LEVEL_NONE = 0,
+ WSP_GGML_LOG_LEVEL_DEBUG = 1,
+ WSP_GGML_LOG_LEVEL_INFO = 2,
+ WSP_GGML_LOG_LEVEL_WARN = 3,
+ WSP_GGML_LOG_LEVEL_ERROR = 4,
+ WSP_GGML_LOG_LEVEL_CONT = 5, // continue previous log
  };

- // ggml object
- struct wsp_ggml_object {
- size_t offs;
- size_t size;
-
- struct wsp_ggml_object * next;
-
- enum wsp_ggml_object_type type;
-
- char padding[4];
+ // this tensor...
+ enum wsp_ggml_tensor_flag {
+ WSP_GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
+ WSP_GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+ WSP_GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
+ WSP_GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
  };

- static const size_t WSP_GGML_OBJECT_SIZE = sizeof(struct wsp_ggml_object);
+ struct wsp_ggml_init_params {
+ // memory pool
+ size_t mem_size; // bytes
+ void * mem_buffer; // if NULL, memory will be allocated internally
+ bool no_alloc; // don't allocate memory for the tensor data
+ };

  // n-dimensional tensor
  struct wsp_ggml_tensor {
- enum wsp_ggml_type type;
- enum wsp_ggml_backend_type backend;
+ enum wsp_ggml_type type;

  struct wsp_ggml_backend_buffer * buffer;

@@ -539,16 +590,11 @@ extern "C" {
  // op params - allocated as int32_t for alignment
  int32_t op_params[WSP_GGML_MAX_OP_PARAMS / sizeof(int32_t)];

- bool is_param;
+ int32_t flags;

- struct wsp_ggml_tensor * grad;
  struct wsp_ggml_tensor * src[WSP_GGML_MAX_SRC];

- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
-
+ // source tensor and offset for views
  struct wsp_ggml_tensor * view_src;
  size_t view_offs;

@@ -563,85 +609,21 @@ extern "C" {

  static const size_t WSP_GGML_TENSOR_SIZE = sizeof(struct wsp_ggml_tensor);

- // the compute plan that needs to be prepared for wsp_ggml_graph_compute()
- // since https://github.com/ggerganov/ggml/issues/287
- struct wsp_ggml_cplan {
- size_t work_size; // size of work buffer, calculated by `wsp_ggml_graph_plan()`
- uint8_t * work_data; // work buffer, to be allocated by caller before calling to `wsp_ggml_graph_compute()`
+ // Abort callback
+ // If not NULL, called before ggml computation
+ // If it returns true, the computation is aborted
+ typedef bool (*wsp_ggml_abort_callback)(void * data);

- int n_threads;

- // abort wsp_ggml_graph_compute when true
- bool (*abort_callback)(void * data);
- void * abort_callback_data;
- };
-
- enum wsp_ggml_cgraph_eval_order {
- WSP_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
- WSP_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
- WSP_GGML_CGRAPH_EVAL_ORDER_COUNT
- };
-
- struct wsp_ggml_hash_set {
- size_t size;
- struct wsp_ggml_tensor ** keys;
- };
-
- // computation graph
- struct wsp_ggml_cgraph {
- int size;
- int n_nodes;
- int n_leafs;
-
- struct wsp_ggml_tensor ** nodes;
- struct wsp_ggml_tensor ** grads;
- struct wsp_ggml_tensor ** leafs;
-
- struct wsp_ggml_hash_set visited_hash_table;
-
- enum wsp_ggml_cgraph_eval_order order;
-
- // performance
- int perf_runs;
- int64_t perf_cycles;
- int64_t perf_time_us;
- };
-
- // scratch buffer
- struct wsp_ggml_scratch {
- size_t offs;
- size_t size;
- void * data;
- };
-
- struct wsp_ggml_init_params {
- // memory pool
- size_t mem_size; // bytes
- void * mem_buffer; // if NULL, memory will be allocated internally
- bool no_alloc; // don't allocate memory for the tensor data
- };
-
-
- // compute types
-
- // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
- // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
- enum wsp_ggml_task_type {
- WSP_GGML_TASK_INIT = 0,
- WSP_GGML_TASK_COMPUTE,
- WSP_GGML_TASK_FINALIZE,
- };
-
- struct wsp_ggml_compute_params {
- enum wsp_ggml_task_type type;
+ //
+ // GUID
+ //

- // ith = thread index, nth = number of threads
- int ith, nth;
+ // GUID types
+ typedef uint8_t wsp_ggml_guid[16];
+ typedef wsp_ggml_guid * wsp_ggml_guid_t;

- // work buffer for all threads
- size_t wsize;
- void * wdata;
- };
+ WSP_GGML_API bool wsp_ggml_guid_matches(wsp_ggml_guid_t guid_a, wsp_ggml_guid_t guid_b);

  // misc
@@ -651,63 +633,78 @@ extern "C" {
  WSP_GGML_API int64_t wsp_ggml_cycles(void);
  WSP_GGML_API int64_t wsp_ggml_cycles_per_ms(void);

- WSP_GGML_API void wsp_ggml_print_backtrace(void);
-
- WSP_GGML_API void wsp_ggml_numa_init(void); // call once for better performance on NUMA systems
- WSP_GGML_API bool wsp_ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+ // accepts a UTF-8 path, even on Windows
+ WSP_GGML_API FILE * wsp_ggml_fopen(const char * fname, const char * mode);

  WSP_GGML_API void wsp_ggml_print_object (const struct wsp_ggml_object * obj);
  WSP_GGML_API void wsp_ggml_print_objects(const struct wsp_ggml_context * ctx);

- WSP_GGML_API WSP_GGML_CALL int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API WSP_GGML_CALL int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API size_t wsp_ggml_nbytes_pad (const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN
+ WSP_GGML_API int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API size_t wsp_ggml_nbytes_pad(const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN

- WSP_GGML_API WSP_GGML_CALL int wsp_ggml_blck_size(enum wsp_ggml_type type);
- WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_type_size(enum wsp_ggml_type type); // size in bytes for all elements in a block
- WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_row_size (enum wsp_ggml_type type, int64_t ne); // size in bytes for all elements in a row
+ WSP_GGML_API int64_t wsp_ggml_blck_size(enum wsp_ggml_type type);
+ WSP_GGML_API size_t wsp_ggml_type_size(enum wsp_ggml_type type); // size in bytes for all elements in a block
+ WSP_GGML_API size_t wsp_ggml_row_size (enum wsp_ggml_type type, int64_t ne); // size in bytes for all elements in a row

  WSP_GGML_DEPRECATED(
  WSP_GGML_API double wsp_ggml_type_sizef(enum wsp_ggml_type type), // wsp_ggml_type_size()/wsp_ggml_blck_size() as float
  "use wsp_ggml_row_size() instead");

- WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_type_name(enum wsp_ggml_type type);
- WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_op_name (enum wsp_ggml_op op);
- WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);
+ WSP_GGML_API const char * wsp_ggml_type_name(enum wsp_ggml_type type);
+ WSP_GGML_API const char * wsp_ggml_op_name (enum wsp_ggml_op op);
+ WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);

- WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
- WSP_GGML_API WSP_GGML_CALL const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
+ WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
+ WSP_GGML_API const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name

- WSP_GGML_API WSP_GGML_CALL size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);

- WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_quantized(enum wsp_ggml_type type);
+ WSP_GGML_API bool wsp_ggml_is_quantized(enum wsp_ggml_type type);

  // TODO: temporary until model loading of ggml examples is refactored
  WSP_GGML_API enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype);

- WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API WSP_GGML_CALL bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API bool wsp_ggml_is_scalar (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API bool wsp_ggml_is_vector (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API bool wsp_ggml_is_matrix (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API bool wsp_ggml_is_3d (const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API int wsp_ggml_n_dims (const struct wsp_ggml_tensor * tensor); // returns 1 for scalars
+ WSP_GGML_API bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_empty (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_scalar (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_vector (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_matrix (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_3d (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API int wsp_ggml_n_dims (const struct wsp_ggml_tensor * tensor); // returns 1 for scalars

- WSP_GGML_API bool wsp_ggml_are_same_shape(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
+ // returns whether the tensor elements can be iterated over with a flattened index (no gaps, no permutation)
+ WSP_GGML_API bool wsp_ggml_is_contiguous (const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API bool wsp_ggml_is_contiguous_0(const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_is_contiguous()
+ WSP_GGML_API bool wsp_ggml_is_contiguous_1(const struct wsp_ggml_tensor * tensor); // contiguous for dims >= 1
+ WSP_GGML_API bool wsp_ggml_is_contiguous_2(const struct wsp_ggml_tensor * tensor); // contiguous for dims >= 2
+
+ // returns whether the tensor elements are allocated as one contiguous block of memory (no gaps, but permutation ok)
+ WSP_GGML_API bool wsp_ggml_is_contiguously_allocated(const struct wsp_ggml_tensor * tensor);
+
+ // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN
+ WSP_GGML_API bool wsp_ggml_is_contiguous_channels(const struct wsp_ggml_tensor * tensor);
+
+ WSP_GGML_API bool wsp_ggml_are_same_shape (const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
+ WSP_GGML_API bool wsp_ggml_are_same_stride(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
+
+ WSP_GGML_API bool wsp_ggml_can_repeat(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);

  // use this to compute the memory overhead of a tensor
  WSP_GGML_API size_t wsp_ggml_tensor_overhead(void);

+ WSP_GGML_API bool wsp_ggml_validate_row_data(enum wsp_ggml_type type, const void * data, size_t nbytes);
+
  // main

- WSP_GGML_API struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params);
- WSP_GGML_API void wsp_ggml_free(struct wsp_ggml_context * ctx);
+ WSP_GGML_API struct wsp_ggml_context * wsp_ggml_init (struct wsp_ggml_init_params params);
+ WSP_GGML_API void wsp_ggml_reset(struct wsp_ggml_context * ctx);
+ WSP_GGML_API void wsp_ggml_free (struct wsp_ggml_context * ctx);

  WSP_GGML_API size_t wsp_ggml_used_mem(const struct wsp_ggml_context * ctx);

- WSP_GGML_API size_t wsp_ggml_set_scratch (struct wsp_ggml_context * ctx, struct wsp_ggml_scratch scratch);
  WSP_GGML_API bool wsp_ggml_get_no_alloc(struct wsp_ggml_context * ctx);
  WSP_GGML_API void wsp_ggml_set_no_alloc(struct wsp_ggml_context * ctx, bool no_alloc);

@@ -747,8 +744,7 @@ extern "C" {
  int64_t ne2,
  int64_t ne3);

- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_i32(struct wsp_ggml_context * ctx, int32_t value);
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_f32(struct wsp_ggml_context * ctx, float value);
+ WSP_GGML_API void * wsp_ggml_new_buffer(struct wsp_ggml_context * ctx, size_t nbytes);

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_dup_tensor (struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_tensor(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * src);
@@ -758,35 +754,25 @@ extern "C" {
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_next_tensor (const struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name);

- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int32_t value);
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_f32 (struct wsp_ggml_tensor * tensor, float value);
-
  // Converts a flat index into coordinates
- WSP_GGML_API void wsp_ggml_unravel_index(const struct wsp_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
-
- WSP_GGML_API int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i);
- WSP_GGML_API void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t value);
-
- WSP_GGML_API int32_t wsp_ggml_get_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- WSP_GGML_API void wsp_ggml_set_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+ WSP_GGML_API void wsp_ggml_unravel_index(const struct wsp_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);

- WSP_GGML_API float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i);
- WSP_GGML_API void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float value);
-
- WSP_GGML_API float wsp_ggml_get_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3);
- WSP_GGML_API void wsp_ggml_set_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+ WSP_GGML_API enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);

  WSP_GGML_API void * wsp_ggml_get_data (const struct wsp_ggml_tensor * tensor);
  WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor);

- WSP_GGML_API WSP_GGML_CALL enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
-
  WSP_GGML_API const char * wsp_ggml_get_name (const struct wsp_ggml_tensor * tensor);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_name ( struct wsp_ggml_tensor * tensor, const char * name);
  WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_format_name( struct wsp_ggml_tensor * tensor, const char * fmt, ...);

+ // Tensor flags
+ WSP_GGML_API void wsp_ggml_set_input(struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_ggml_set_output(struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_ggml_set_param(struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_ggml_set_loss(struct wsp_ggml_tensor * tensor);
+
  //
  // operations on tensors with backpropagation
  //
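
Note: the per-tensor bool is_param and grad fields are gone; graph roles are now expressed through the flag setters added above. A minimal sketch (hypothetical tensors built in some wsp_ggml_context):

    void mark_graph_io(struct wsp_ggml_tensor * inp, struct wsp_ggml_tensor * out) {
        wsp_ggml_set_input(inp);   // sets WSP_GGML_TENSOR_FLAG_INPUT
        wsp_ggml_set_output(out);  // sets WSP_GGML_TENSOR_FLAG_OUTPUT
    }
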
@@ -901,6 +887,22 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sin(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sin_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cos(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cos_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
  // return scalar
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sum(
  struct wsp_ggml_context * ctx,
@@ -921,6 +923,12 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ // count number of equal elements in a and b
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_count_equal(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * b);
+
  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat(
@@ -928,18 +936,28 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ // repeat a to the specified shape
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat_4d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
  // sums repetitions in a into shape of b
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat_back(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b);
+ struct wsp_ggml_tensor * b); // sum up values that are adjacent in dims > 0 instead of repeated with same stride

- // concat a and b on dim 2
+ // concat a and b along dim
  // used in stable-diffusion
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_concat(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b);
+ struct wsp_ggml_tensor * b,
+ int dim);

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_abs(
  struct wsp_ggml_context * ctx,
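
Note: wsp_ggml_concat now takes the concatenation dimension explicitly instead of always using dim 2. A minimal migration sketch (hypothetical wrapper):

    struct wsp_ggml_tensor * concat_like_before(struct wsp_ggml_context * ctx,
                                                struct wsp_ggml_tensor * a,
                                                struct wsp_ggml_tensor * b) {
        // rc.8: wsp_ggml_concat(ctx, a, b) concatenated on dim 2 implicitly;
        // 0.4.0: pass dim = 2 to keep the old behavior
        return wsp_ggml_concat(ctx, a, b, 2);
    }
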
@@ -1001,6 +1019,14 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sigmoid(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sigmoid_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);
@@ -1009,6 +1035,16 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ // GELU using erf (error function) when possible
+ // some backends may fallback to approximation based on Abramowitz and Stegun formula
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_erf(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_erf_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu_quick(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);
@@ -1032,6 +1068,24 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ // hardswish(x) = x * relu6(x + 3) / 6
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_hardswish(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ // hardsigmoid(x) = relu6(x + 3) / 6
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_hardsigmoid(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_exp(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_exp_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
  // normalize along rows
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm(
  struct wsp_ggml_context * ctx,
@@ -1055,16 +1109,29 @@ extern "C" {

  // group normalize along ne0*ne1*n_groups
  // used in stable-diffusion
- // TODO: eps is hardcoded to 1e-6 for now
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_group_norm(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_group_norm_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);
+
+ // l2 normalize along rows
+ // used in rwkv v7
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_l2_norm(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ float eps);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_l2_norm_inplace(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ float eps);

  // a - x
  // b - dy
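
Note: group_norm's epsilon is no longer hardcoded to 1e-6; it is now an argument. A minimal migration sketch (hypothetical wrapper; n_groups is whatever the caller already used):

    struct wsp_ggml_tensor * group_norm_like_before(struct wsp_ggml_context * ctx,
                                                    struct wsp_ggml_tensor * a,
                                                    int n_groups) {
        // 1e-6f matches the previously hardcoded eps
        return wsp_ggml_group_norm(ctx, a, n_groups, 1e-6f);
    }
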
@@ -1089,14 +1156,11 @@ extern "C" {
  enum wsp_ggml_prec prec);

  // indirect matrix multiplication
- // wsp_ggml_mul_mat_id(ctx, as, ids, id, b) ~= wsp_ggml_mul_mat(as[ids[id]], b)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat_id(
  struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * const as[],
- int n_as,
- struct wsp_ggml_tensor * ids,
- int id,
- struct wsp_ggml_tensor * b);
+ struct wsp_ggml_tensor * as,
+ struct wsp_ggml_tensor * b,
+ struct wsp_ggml_tensor * ids);

  // A: m columns, n rows,
  // B: p columns, n rows,
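
Note: wsp_ggml_mul_mat_id no longer takes an array of expert tensors plus an id; the experts are a single stacked tensor and ids selects per row. A minimal sketch of the new call shape (hypothetical names):

    // as:  experts stacked into one tensor
    // ids: int32 tensor of expert indices
    struct wsp_ggml_tensor * expert_mul_mat(struct wsp_ggml_context * ctx,
                                            struct wsp_ggml_tensor * as,
                                            struct wsp_ggml_tensor * b,
                                            struct wsp_ggml_tensor * ids) {
        return wsp_ggml_mul_mat_id(ctx, as, b, ids); // note the argument order: as, b, ids
    }
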
@@ -1129,7 +1193,7 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_inplace(
@@ -1139,19 +1203,19 @@ extern "C" {
  size_t nb1,
  size_t nb2,
  size_t nb3,
- size_t offset);
+ size_t offset); // in bytes

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return modified a
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d(
@@ -1159,7 +1223,7 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // b -> view(a,offset,nb1,nb2,3), return view(a)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d_inplace(
@@ -1167,7 +1231,7 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b,
  size_t nb1,
- size_t offset);
+ size_t offset); // in bytes

  // a -> b, return view(b)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy(
@@ -1302,14 +1366,14 @@ extern "C" {
  // supports 3D: a->ne[2] == b->ne[1]
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows(
  struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b);
+ struct wsp_ggml_tensor * a, // data
+ struct wsp_ggml_tensor * b); // row indices

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows_back(
  struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b,
- struct wsp_ggml_tensor * c);
+ struct wsp_ggml_tensor * a, // gradients of wsp_ggml_get_rows result
+ struct wsp_ggml_tensor * b, // row indices
+ struct wsp_ggml_tensor * c); // data for wsp_ggml_get_rows, only used for its shape

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag(
  struct wsp_ggml_context * ctx,
@@ -1348,29 +1412,34 @@ extern "C" {
1348
1412
  struct wsp_ggml_context * ctx,
1349
1413
  struct wsp_ggml_tensor * a);
1350
1414
 
1351
- // fused soft_max(a*scale + mask)
1415
+ // fused soft_max(a*scale + mask*(ALiBi slope))
1352
1416
  // mask is optional
1417
+ // max_bias = 0.0f for no ALiBi
1353
1418
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
1354
1419
  struct wsp_ggml_context * ctx,
1355
1420
  struct wsp_ggml_tensor * a,
1356
1421
  struct wsp_ggml_tensor * mask,
1357
- float scale);
1422
+ float scale,
1423
+ float max_bias);
1358
1424
 
1359
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back(
1425
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back(
1360
1426
  struct wsp_ggml_context * ctx,
1361
1427
  struct wsp_ggml_tensor * a,
1362
- struct wsp_ggml_tensor * b);
1428
+ struct wsp_ggml_tensor * b,
1429
+ float scale,
1430
+ float max_bias);
1363
1431
 
1364
1432
  // in-place, returns view(a)
1365
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back_inplace(
1433
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back_inplace(
1366
1434
  struct wsp_ggml_context * ctx,
1367
1435
  struct wsp_ggml_tensor * a,
1368
- struct wsp_ggml_tensor * b);
1436
+ struct wsp_ggml_tensor * b,
1437
+ float scale,
1438
+ float max_bias);
1369
1439
 
1370
1440
  // rotary position embedding
1371
- // if mode & 1 == 1, skip n_past elements (DEPRECATED)
1372
- // if mode & 2 == 1, GPT-NeoX style
1373
- // if mode & 4 == 1, ChatGLM style
1441
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1442
+ // if (mode & WSP_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1374
1443
  //
1375
1444
  // b is an int32 vector with size a->ne[2], it contains the positions
1376
1445
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope(
@@ -1378,8 +1447,7 @@ extern "C" {
1378
1447
  struct wsp_ggml_tensor * a,
1379
1448
  struct wsp_ggml_tensor * b,
1380
1449
  int n_dims,
1381
- int mode,
1382
- int n_ctx);
1450
+ int mode);
1383
1451
 
1384
1452
  // in-place, returns view(a)
1385
1453
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_inplace(
@@ -1387,18 +1455,34 @@ extern "C" {
1387
1455
  struct wsp_ggml_tensor * a,
1388
1456
  struct wsp_ggml_tensor * b,
1389
1457
  int n_dims,
1390
- int mode,
1391
- int n_ctx);
1458
+ int mode);
1392
1459
 
1393
1460
  // custom RoPE
1394
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
1461
+ // c is freq factors (e.g. phi3-128k), (optional)
1462
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext(
1395
1463
  struct wsp_ggml_context * ctx,
1396
1464
  struct wsp_ggml_tensor * a,
1397
1465
  struct wsp_ggml_tensor * b,
1466
+ struct wsp_ggml_tensor * c,
1398
1467
  int n_dims,
1399
1468
  int mode,
1400
- int n_ctx,
1401
- int n_orig_ctx,
1469
+ int n_ctx_orig,
1470
+ float freq_base,
1471
+ float freq_scale,
1472
+ float ext_factor,
1473
+ float attn_factor,
1474
+ float beta_fast,
1475
+ float beta_slow);
1476
+
1477
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_multi(
1478
+ struct wsp_ggml_context * ctx,
1479
+ struct wsp_ggml_tensor * a,
1480
+ struct wsp_ggml_tensor * b,
1481
+ struct wsp_ggml_tensor * c,
1482
+ int n_dims,
1483
+ int sections[4],
1484
+ int mode,
1485
+ int n_ctx_orig,
1402
1486
  float freq_base,
1403
1487
  float freq_scale,
1404
1488
  float ext_factor,
@@ -1407,14 +1491,14 @@ extern "C" {
1407
1491
  float beta_slow);
1408
1492
 
1409
1493
  // in-place, returns view(a)
1410
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom_inplace(
1494
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext_inplace(
1411
1495
  struct wsp_ggml_context * ctx,
1412
1496
  struct wsp_ggml_tensor * a,
1413
1497
  struct wsp_ggml_tensor * b,
1498
+ struct wsp_ggml_tensor * c,
1414
1499
  int n_dims,
1415
1500
  int mode,
1416
- int n_ctx,
1417
- int n_orig_ctx,
1501
+ int n_ctx_orig,
1418
1502
  float freq_base,
1419
1503
  float freq_scale,
1420
1504
  float ext_factor,
@@ -1422,46 +1506,73 @@ extern "C" {
1422
1506
  float beta_fast,
1423
1507
  float beta_slow);
1424
1508
 
1425
- // compute correction dims for YaRN RoPE scaling
1426
- WSP_GGML_CALL void wsp_ggml_rope_yarn_corr_dims(
1427
- int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1509
+ WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
1510
+ struct wsp_ggml_context * ctx,
1511
+ struct wsp_ggml_tensor * a,
1512
+ struct wsp_ggml_tensor * b,
1513
+ int n_dims,
1514
+ int mode,
1515
+ int n_ctx_orig,
1516
+ float freq_base,
1517
+ float freq_scale,
1518
+ float ext_factor,
1519
+ float attn_factor,
1520
+ float beta_fast,
1521
+ float beta_slow),
1522
+ "use wsp_ggml_rope_ext instead");
1428
1523
 
1429
- // xPos RoPE, in-place, returns view(a)
1430
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_xpos_inplace(
1524
+ WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom_inplace(
1431
1525
  struct wsp_ggml_context * ctx,
1432
1526
  struct wsp_ggml_tensor * a,
1433
1527
  struct wsp_ggml_tensor * b,
1434
1528
  int n_dims,
1435
- float base,
1436
- bool down);
1529
+ int mode,
1530
+ int n_ctx_orig,
1531
+ float freq_base,
1532
+ float freq_scale,
1533
+ float ext_factor,
1534
+ float attn_factor,
1535
+ float beta_fast,
1536
+ float beta_slow),
1537
+ "use wsp_ggml_rope_ext_inplace instead");
1538
+
1539
+ // compute correction dims for YaRN RoPE scaling
1540
+ WSP_GGML_API void wsp_ggml_rope_yarn_corr_dims(
1541
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1437
1542
 
1438
1543
  // rotary position embedding backward, i.e compute dx from dy
1439
1544
  // a - dy
1440
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_back(
1545
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext_back(
1441
1546
  struct wsp_ggml_context * ctx,
1442
- struct wsp_ggml_tensor * a,
1443
- struct wsp_ggml_tensor * b,
1547
+ struct wsp_ggml_tensor * a, // gradients of wsp_ggml_rope result
1548
+ struct wsp_ggml_tensor * b, // positions
1549
+ struct wsp_ggml_tensor * c, // freq factors
1444
1550
  int n_dims,
1445
1551
  int mode,
1446
- int n_ctx,
1447
- int n_orig_ctx,
1552
+ int n_ctx_orig,
1448
1553
  float freq_base,
1449
1554
  float freq_scale,
1450
1555
  float ext_factor,
1451
1556
  float attn_factor,
1452
1557
  float beta_fast,
1453
- float beta_slow,
1454
- float xpos_base,
1455
- bool xpos_down);
1558
+ float beta_slow);
1456
1559
 
1457
- // alibi position embedding
1458
- // in-place, returns view(a)
1459
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_alibi(
1560
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_multi_back(
1460
1561
  struct wsp_ggml_context * ctx,
1461
1562
  struct wsp_ggml_tensor * a,
1462
- int n_past,
1463
- int n_head,
1464
- float bias_max);
1563
+ struct wsp_ggml_tensor * b,
1564
+ struct wsp_ggml_tensor * c,
1565
+ int n_dims,
1566
+ int sections[4],
1567
+ int mode,
1568
+ int n_ctx_orig,
1569
+ float freq_base,
1570
+ float freq_scale,
1571
+ float ext_factor,
1572
+ float attn_factor,
1573
+ float beta_fast,
1574
+ float beta_slow);
1575
+
1465
1576
 
1466
1577
  // clamp
1467
1578
  // in-place, returns view(a)
@@ -1471,22 +1582,38 @@ extern "C" {
1471
1582
  float min,
1472
1583
  float max);
1473
1584
 
1585
+ // im2col
1586
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
1474
1587
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col(
1475
1588
  struct wsp_ggml_context * ctx,
1476
- struct wsp_ggml_tensor * a,
1477
- struct wsp_ggml_tensor * b,
1478
- int s0,
1479
- int s1,
1480
- int p0,
1481
- int p1,
1482
- int d0,
1483
- int d1,
1484
- bool is_2D);
1589
+ struct wsp_ggml_tensor * a, // convolution kernel
1590
+ struct wsp_ggml_tensor * b, // data
1591
+ int s0, // stride dimension 0
1592
+ int s1, // stride dimension 1
1593
+ int p0, // padding dimension 0
1594
+ int p1, // padding dimension 1
1595
+ int d0, // dilation dimension 0
1596
+ int d1, // dilation dimension 1
1597
+ bool is_2D,
1598
+ enum wsp_ggml_type dst_type);
1599
+
1600
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col_back(
1601
+ struct wsp_ggml_context * ctx,
1602
+ struct wsp_ggml_tensor * a, // convolution kernel
1603
+ struct wsp_ggml_tensor * b, // gradient of im2col output
1604
+ int64_t * ne, // shape of im2col input
1605
+ int s0, // stride dimension 0
1606
+ int s1, // stride dimension 1
1607
+ int p0, // padding dimension 0
1608
+ int p1, // padding dimension 1
1609
+ int d0, // dilation dimension 0
1610
+ int d1, // dilation dimension 1
1611
+ bool is_2D);
1485
1612
 
1486
1613
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
1487
1614
  struct wsp_ggml_context * ctx,
1488
- struct wsp_ggml_tensor * a,
1489
- struct wsp_ggml_tensor * b,
1615
+ struct wsp_ggml_tensor * a, // convolution kernel
1616
+ struct wsp_ggml_tensor * b, // data
1490
1617
  int s0, // stride
1491
1618
  int p0, // padding
1492
1619
  int d0); // dilation
@@ -1495,30 +1622,46 @@ extern "C" {
1495
1622
  // alias for wsp_ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1496
1623
  WSP_GGML_API struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph(
1497
1624
  struct wsp_ggml_context * ctx,
1498
- struct wsp_ggml_tensor * a,
1499
- struct wsp_ggml_tensor * b,
1500
- int s,
1501
- int d);
1625
+ struct wsp_ggml_tensor * a, // convolution kernel
1626
+ struct wsp_ggml_tensor * b, // data
1627
+ int s, // stride
1628
+ int d); // dilation
1629
+
1630
+ // depthwise
1631
+ // TODO: this is very likely wrong for some cases! - needs more testing
1632
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw(
1633
+ struct wsp_ggml_context * ctx,
1634
+ struct wsp_ggml_tensor * a, // convolution kernel
1635
+ struct wsp_ggml_tensor * b, // data
1636
+ int s0, // stride
1637
+ int p0, // padding
1638
+ int d0); // dilation
1639
+
1640
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw_ph(
1641
+ struct wsp_ggml_context * ctx,
1642
+ struct wsp_ggml_tensor * a, // convolution kernel
1643
+ struct wsp_ggml_tensor * b, // data
1644
+ int s0, // stride
1645
+ int d0); // dilation
1502
1646
 
1503
1647
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
1504
1648
  struct wsp_ggml_context * ctx,
1505
- struct wsp_ggml_tensor * a,
1506
- struct wsp_ggml_tensor * b,
1507
- int s0,
1508
- int p0,
1509
- int d0);
1649
+ struct wsp_ggml_tensor * a, // convolution kernel
1650
+ struct wsp_ggml_tensor * b, // data
1651
+ int s0, // stride
1652
+ int p0, // padding
1653
+ int d0); // dilation
1510
1654
 
1511
1655
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d(
1512
1656
  struct wsp_ggml_context * ctx,
1513
- struct wsp_ggml_tensor * a,
1514
- struct wsp_ggml_tensor * b,
1515
- int s0,
1516
- int s1,
1517
- int p0,
1518
- int p1,
1519
- int d0,
1520
- int d1);
1521
-
1657
+ struct wsp_ggml_tensor * a, // convolution kernel
1658
+ struct wsp_ggml_tensor * b, // data
1659
+ int s0, // stride dimension 0
1660
+ int s1, // stride dimension 1
1661
+ int p0, // padding dimension 0
1662
+ int p1, // padding dimension 1
1663
+ int d0, // dilation dimension 0
1664
+ int d1); // dilation dimension 1
1522
1665
 
1523
1666
  // kernel size is a->ne[0] x a->ne[1]
1524
1667
  // stride is equal to kernel size
@@ -1546,6 +1689,34 @@ extern "C" {
1546
1689
  struct wsp_ggml_tensor * a,
1547
1690
  struct wsp_ggml_tensor * b);
1548
1691
 
1692
+ // depthwise (via im2col and mul_mat)
1693
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d_dw(
1694
+ struct wsp_ggml_context * ctx,
1695
+ struct wsp_ggml_tensor * a, // convolution kernel
1696
+ struct wsp_ggml_tensor * b, // data
1697
+ int s0, // stride dimension 0
1698
+ int s1, // stride dimension 1
1699
+ int p0, // padding dimension 0
1700
+ int p1, // padding dimension 1
1701
+ int d0, // dilation dimension 0
1702
+ int d1); // dilation dimension 1
1703
+
1704
+ // Depthwise 2D convolution
1705
+ // may be faster than wsp_ggml_conv_2d_dw, but not available in all backends
1706
+ // a: KW KH 1 C convolution kernel
1707
+ // b: W H C N input data
1708
+ // res: W_out H_out C N
1709
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d_dw_direct(
1710
+ struct wsp_ggml_context * ctx,
1711
+ struct wsp_ggml_tensor * a,
1712
+ struct wsp_ggml_tensor * b,
1713
+ int stride0,
1714
+ int stride1,
1715
+ int pad0,
1716
+ int pad1,
1717
+ int dilation0,
1718
+ int dilation1);
1719
+
1549
1720
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_2d_p0(
1550
1721
  struct wsp_ggml_context * ctx,
1551
1722
  struct wsp_ggml_tensor * a,
@@ -1579,12 +1750,41 @@ extern "C" {
1579
1750
  float p0,
1580
1751
  float p1);
1581
1752
 
1582
- // nearest interpolate
1583
- // used in stable-diffusion
1753
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pool_2d_back(
1754
+ struct wsp_ggml_context * ctx,
1755
+ struct wsp_ggml_tensor * a,
1756
+ struct wsp_ggml_tensor * af, // "a"/input used in forward pass
1757
+ enum wsp_ggml_op_pool op,
1758
+ int k0,
1759
+ int k1,
1760
+ int s0,
1761
+ int s1,
1762
+ float p0,
1763
+ float p1);
1764
+
1765
+ enum wsp_ggml_scale_mode {
1766
+ WSP_GGML_SCALE_MODE_NEAREST = 0,
1767
+ WSP_GGML_SCALE_MODE_BILINEAR = 1,
1768
+ };
1769
+
1770
+ // interpolate
1771
+ // multiplies ne0 and ne1 by scale factor
1584
1772
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale(
1585
1773
  struct wsp_ggml_context * ctx,
1586
1774
  struct wsp_ggml_tensor * a,
1587
- int scale_factor);
1775
+ int scale_factor,
1776
+ enum wsp_ggml_scale_mode mode);
1777
+
1778
+ // interpolate
1779
+ // interpolate scale to specified dimensions
1780
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale_ext(
1781
+ struct wsp_ggml_context * ctx,
1782
+ struct wsp_ggml_tensor * a,
1783
+ int ne0,
1784
+ int ne1,
1785
+ int ne2,
1786
+ int ne3,
1787
+ enum wsp_ggml_scale_mode mode);
1588
1788
 
1589
1789
  // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
1590
1790
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad(
@@ -1595,10 +1795,37 @@ extern "C" {
1595
1795
  int p2,
1596
1796
  int p3);
1597
1797
 
1798
+ // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
1799
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_reflect_1d(
1800
+ struct wsp_ggml_context * ctx,
1801
+ struct wsp_ggml_tensor * a,
1802
+ int p0,
1803
+ int p1);
1804
+
1805
+ // Move tensor elements by an offset given for each dimension. Elements that
1806
+ // are shifted beyond the last position are wrapped around to the beginning.
1807
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_roll(
1808
+ struct wsp_ggml_context * ctx,
1809
+ struct wsp_ggml_tensor * a,
1810
+ int shift0,
1811
+ int shift1,
1812
+ int shift2,
1813
+ int shift3);
1814
+
1815
+
1816
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1817
+ // timesteps: [N,]
1818
+ // return: [N, dim]
1819
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_timestep_embedding(
1820
+ struct wsp_ggml_context * ctx,
1821
+ struct wsp_ggml_tensor * timesteps,
1822
+ int dim,
1823
+ int max_period);
1824
+
1598
1825
  // sort rows
1599
1826
  enum wsp_ggml_sort_order {
1600
- WSP_GGML_SORT_ASC,
1601
- WSP_GGML_SORT_DESC,
1827
+ WSP_GGML_SORT_ORDER_ASC,
1828
+ WSP_GGML_SORT_ORDER_DESC,
1602
1829
  };
1603
1830
 
1604
1831
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_argsort(
@@ -1606,19 +1833,43 @@ extern "C" {
1606
1833
  struct wsp_ggml_tensor * a,
1607
1834
  enum wsp_ggml_sort_order order);
1608
1835
 
1836
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_arange(
1837
+ struct wsp_ggml_context * ctx,
1838
+ float start,
1839
+ float stop,
1840
+ float step);
1841
+
1609
1842
  // top k elements per row
1610
1843
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_top_k(
1611
1844
  struct wsp_ggml_context * ctx,
1612
1845
  struct wsp_ggml_tensor * a,
1613
1846
  int k);
1614
1847
 
1615
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn(
1848
+ #define WSP_GGML_KQ_MASK_PAD 64
1849
+
1850
+ // q: [n_embd_k, n_batch, n_head, 1]
1851
+ // k: [n_embd_k, n_kv, n_head_kv, 1]
1852
+ // v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
1853
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = WSP_GGML_PAD(n_batch, WSP_GGML_KQ_MASK_PAD) !!
1854
+ // res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
1855
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_ext(
1616
1856
  struct wsp_ggml_context * ctx,
1617
1857
  struct wsp_ggml_tensor * q,
1618
1858
  struct wsp_ggml_tensor * k,
1619
1859
  struct wsp_ggml_tensor * v,
1620
- bool masked);
1860
+ struct wsp_ggml_tensor * mask,
1861
+ float scale,
1862
+ float max_bias,
1863
+ float logit_softcap);
1621
1864
 
1865
+ WSP_GGML_API void wsp_ggml_flash_attn_ext_set_prec(
1866
+ struct wsp_ggml_tensor * a,
1867
+ enum wsp_ggml_prec prec);
1868
+
1869
+ WSP_GGML_API enum wsp_ggml_prec wsp_ggml_flash_attn_ext_get_prec(
1870
+ const struct wsp_ggml_tensor * a);
1871
+
1872
+ // TODO: needs to be adapted to wsp_ggml_flash_attn_ext
1622
1873
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_back(
1623
1874
  struct wsp_ggml_context * ctx,
1624
1875
  struct wsp_ggml_tensor * q,
@@ -1627,13 +1878,19 @@ extern "C" {
1627
1878
  struct wsp_ggml_tensor * d,
1628
1879
  bool masked);
1629
1880
 
1630
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_ff(
1881
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ssm_conv(
1631
1882
  struct wsp_ggml_context * ctx,
1632
- struct wsp_ggml_tensor * a,
1633
- struct wsp_ggml_tensor * b0,
1634
- struct wsp_ggml_tensor * b1,
1635
- struct wsp_ggml_tensor * c0,
1636
- struct wsp_ggml_tensor * c1);
1883
+ struct wsp_ggml_tensor * sx,
1884
+ struct wsp_ggml_tensor * c);
1885
+
1886
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ssm_scan(
1887
+ struct wsp_ggml_context * ctx,
1888
+ struct wsp_ggml_tensor * s,
1889
+ struct wsp_ggml_tensor * x,
1890
+ struct wsp_ggml_tensor * dt,
1891
+ struct wsp_ggml_tensor * A,
1892
+ struct wsp_ggml_tensor * B,
1893
+ struct wsp_ggml_tensor * C);
1637
1894
 
1638
1895
  // partition into non-overlapping windows with padding if needed
1639
1896
  // example:
@@ -1685,90 +1942,42 @@ extern "C" {
1685
1942
  struct wsp_ggml_tensor * pw,
1686
1943
  struct wsp_ggml_tensor * ph);
1687
1944
 
1688
- // custom operators
1945
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rwkv_wkv6(
1946
+ struct wsp_ggml_context * ctx,
1947
+ struct wsp_ggml_tensor * k,
1948
+ struct wsp_ggml_tensor * v,
1949
+ struct wsp_ggml_tensor * r,
1950
+ struct wsp_ggml_tensor * tf,
1951
+ struct wsp_ggml_tensor * td,
1952
+ struct wsp_ggml_tensor * state);
1953
+
1954
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gated_linear_attn(
1955
+ struct wsp_ggml_context * ctx,
1956
+ struct wsp_ggml_tensor * k,
1957
+ struct wsp_ggml_tensor * v,
1958
+ struct wsp_ggml_tensor * q,
1959
+ struct wsp_ggml_tensor * g,
1960
+ struct wsp_ggml_tensor * state,
1961
+ float scale);
1689
1962
 
1690
- typedef void (*wsp_ggml_unary_op_f32_t) (const int, float *, const float *);
1691
- typedef void (*wsp_ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
1692
-
1693
- typedef void (*wsp_ggml_custom1_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *);
1694
- typedef void (*wsp_ggml_custom2_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *);
1695
- typedef void (*wsp_ggml_custom3_op_f32_t)(struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *, const struct wsp_ggml_tensor *);
1696
-
1697
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_unary_f32(
1698
- struct wsp_ggml_context * ctx,
1699
- struct wsp_ggml_tensor * a,
1700
- wsp_ggml_unary_op_f32_t fun),
1701
- "use wsp_ggml_map_custom1 instead");
1702
-
1703
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_unary_inplace_f32(
1704
- struct wsp_ggml_context * ctx,
1705
- struct wsp_ggml_tensor * a,
1706
- wsp_ggml_unary_op_f32_t fun),
1707
- "use wsp_ggml_map_custom1_inplace instead");
1708
-
1709
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_binary_f32(
1710
- struct wsp_ggml_context * ctx,
1711
- struct wsp_ggml_tensor * a,
1712
- struct wsp_ggml_tensor * b,
1713
- wsp_ggml_binary_op_f32_t fun),
1714
- "use wsp_ggml_map_custom2 instead");
1715
-
1716
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_binary_inplace_f32(
1717
- struct wsp_ggml_context * ctx,
1718
- struct wsp_ggml_tensor * a,
1719
- struct wsp_ggml_tensor * b,
1720
- wsp_ggml_binary_op_f32_t fun),
1721
- "use wsp_ggml_map_custom2_inplace instead");
1722
-
1723
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1_f32(
1724
- struct wsp_ggml_context * ctx,
1725
- struct wsp_ggml_tensor * a,
1726
- wsp_ggml_custom1_op_f32_t fun),
1727
- "use wsp_ggml_map_custom1 instead");
1728
-
1729
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1_inplace_f32(
1730
- struct wsp_ggml_context * ctx,
1731
- struct wsp_ggml_tensor * a,
1732
- wsp_ggml_custom1_op_f32_t fun),
1733
- "use wsp_ggml_map_custom1_inplace instead");
1734
-
1735
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom2_f32(
1736
- struct wsp_ggml_context * ctx,
1737
- struct wsp_ggml_tensor * a,
1738
- struct wsp_ggml_tensor * b,
1739
- wsp_ggml_custom2_op_f32_t fun),
1740
- "use wsp_ggml_map_custom2 instead");
1741
-
1742
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom2_inplace_f32(
1743
- struct wsp_ggml_context * ctx,
1744
- struct wsp_ggml_tensor * a,
1745
- struct wsp_ggml_tensor * b,
1746
- wsp_ggml_custom2_op_f32_t fun),
1747
- "use wsp_ggml_map_custom2_inplace instead");
1748
-
1749
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom3_f32(
1750
- struct wsp_ggml_context * ctx,
1751
- struct wsp_ggml_tensor * a,
1752
- struct wsp_ggml_tensor * b,
1753
- struct wsp_ggml_tensor * c,
1754
- wsp_ggml_custom3_op_f32_t fun),
1755
- "use wsp_ggml_map_custom3 instead");
1756
-
1757
- WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom3_inplace_f32(
1758
- struct wsp_ggml_context * ctx,
1759
- struct wsp_ggml_tensor * a,
1760
- struct wsp_ggml_tensor * b,
1761
- struct wsp_ggml_tensor * c,
1762
- wsp_ggml_custom3_op_f32_t fun),
1763
- "use wsp_ggml_map_custom3_inplace instead");
1764
-
1765
- // custom operators v2
1963
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rwkv_wkv7(
1964
+ struct wsp_ggml_context * ctx,
1965
+ struct wsp_ggml_tensor * r,
1966
+ struct wsp_ggml_tensor * w,
1967
+ struct wsp_ggml_tensor * k,
1968
+ struct wsp_ggml_tensor * v,
1969
+ struct wsp_ggml_tensor * a,
1970
+ struct wsp_ggml_tensor * b,
1971
+ struct wsp_ggml_tensor * state);
1972
+
1973
+ // custom operators
1766
1974
 
1767
1975
  typedef void (*wsp_ggml_custom1_op_t)(struct wsp_ggml_tensor * dst , const struct wsp_ggml_tensor * a, int ith, int nth, void * userdata);
1768
1976
  typedef void (*wsp_ggml_custom2_op_t)(struct wsp_ggml_tensor * dst , const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b, int ith, int nth, void * userdata);
1769
1977
  typedef void (*wsp_ggml_custom3_op_t)(struct wsp_ggml_tensor * dst , const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b, const struct wsp_ggml_tensor * c, int ith, int nth, void * userdata);
1770
1978
 
1771
- #define WSP_GGML_N_TASKS_MAX -1
1979
+ #define WSP_GGML_N_TASKS_MAX (-1)
1980
+ // n_tasks == WSP_GGML_N_TASKS_MAX means to use max number of tasks
1772
1981
 
1773
1982
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1(
1774
1983
  struct wsp_ggml_context * ctx,
@@ -1818,56 +2027,85 @@ extern "C" {
1818
2027
  int n_tasks,
1819
2028
  void * userdata);
1820
2029
 
2030
+ typedef void (*wsp_ggml_custom_op_t)(struct wsp_ggml_tensor * dst , int ith, int nth, void * userdata);
2031
+
2032
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_custom_4d(
2033
+ struct wsp_ggml_context * ctx,
2034
+ enum wsp_ggml_type type,
2035
+ int64_t ne0,
2036
+ int64_t ne1,
2037
+ int64_t ne2,
2038
+ int64_t ne3,
2039
+ struct wsp_ggml_tensor ** args,
2040
+ int n_args,
2041
+ wsp_ggml_custom_op_t fun,
2042
+ int n_tasks,
2043
+ void * userdata);
2044
+
2045
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_custom_inplace(
2046
+ struct wsp_ggml_context * ctx,
2047
+ struct wsp_ggml_tensor * a,
2048
+ struct wsp_ggml_tensor ** args,
2049
+ int n_args,
2050
+ wsp_ggml_custom_op_t fun,
2051
+ int n_tasks,
2052
+ void * userdata);
2053
+
1821
2054
  // loss function
1822
2055
 
1823
2056
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss(
1824
- struct wsp_ggml_context * ctx,
1825
- struct wsp_ggml_tensor * a,
1826
- struct wsp_ggml_tensor * b);
2057
+ struct wsp_ggml_context * ctx,
2058
+ struct wsp_ggml_tensor * a, // logits
2059
+ struct wsp_ggml_tensor * b); // labels
1827
2060
 
1828
2061
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss_back(
1829
- struct wsp_ggml_context * ctx,
1830
- struct wsp_ggml_tensor * a,
1831
- struct wsp_ggml_tensor * b,
1832
- struct wsp_ggml_tensor * c);
2062
+ struct wsp_ggml_context * ctx,
2063
+ struct wsp_ggml_tensor * a, // logits
2064
+ struct wsp_ggml_tensor * b, // labels
2065
+ struct wsp_ggml_tensor * c); // gradients of cross_entropy_loss result
2066
+
2067
+ // AdamW optimizer step
2068
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
2069
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
2070
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_opt_step_adamw(
2071
+ struct wsp_ggml_context * ctx,
2072
+ struct wsp_ggml_tensor * a,
2073
+ struct wsp_ggml_tensor * grad,
2074
+ struct wsp_ggml_tensor * m,
2075
+ struct wsp_ggml_tensor * v,
2076
+ struct wsp_ggml_tensor * adamw_params); // parameters such a the learning rate
1833
2077
 
1834
2078
  //
1835
2079
  // automatic differentiation
1836
2080
  //
1837
2081
 
1838
- WSP_GGML_API void wsp_ggml_set_param(
1839
- struct wsp_ggml_context * ctx,
1840
- struct wsp_ggml_tensor * tensor);
2082
+ WSP_GGML_API void wsp_ggml_build_forward_expand(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
2083
+ WSP_GGML_API void wsp_ggml_build_backward_expand(
2084
+ struct wsp_ggml_context * ctx, // context for gradient computation
2085
+ struct wsp_ggml_cgraph * cgraph,
2086
+ struct wsp_ggml_tensor ** grad_accs);
1841
2087
 
2088
+ // graph allocation in a context
2089
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx); // size = WSP_GGML_DEFAULT_GRAPH_SIZE, grads = false
2090
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph_custom(struct wsp_ggml_context * ctx, size_t size, bool grads);
2091
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_dup (struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph, bool force_grads);
2092
+ WSP_GGML_API void wsp_ggml_graph_cpy (struct wsp_ggml_cgraph * src, struct wsp_ggml_cgraph * dst);
2093
+ WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
2094
+ WSP_GGML_API void wsp_ggml_graph_clear (struct wsp_ggml_cgraph * cgraph);
1842
2095
 
1843
- WSP_GGML_API void wsp_ggml_build_forward_expand (struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
1844
- WSP_GGML_API void wsp_ggml_build_backward_expand(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, struct wsp_ggml_cgraph * gb, bool keep);
2096
+ WSP_GGML_API int wsp_ggml_graph_size (struct wsp_ggml_cgraph * cgraph);
2097
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_node (struct wsp_ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
2098
+ WSP_GGML_API struct wsp_ggml_tensor ** wsp_ggml_graph_nodes (struct wsp_ggml_cgraph * cgraph);
2099
+ WSP_GGML_API int wsp_ggml_graph_n_nodes(struct wsp_ggml_cgraph * cgraph);
1845
2100
 
1846
- // graph allocation in a context
1847
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx); // size = WSP_GGML_DEFAULT_GRAPH_SIZE, grads = false
1848
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph_custom (struct wsp_ggml_context * ctx, size_t size, bool grads);
1849
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_dup (struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph);
1850
- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_graph_view (struct wsp_ggml_cgraph * cgraph, int i0, int i1);
1851
- WSP_GGML_API void wsp_ggml_graph_cpy (struct wsp_ggml_cgraph * src, struct wsp_ggml_cgraph * dst);
1852
- WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph); // zero grads
1853
- WSP_GGML_API void wsp_ggml_graph_clear (struct wsp_ggml_cgraph * cgraph);
2101
+ WSP_GGML_API void wsp_ggml_graph_add_node(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
1854
2102
 
1855
2103
  WSP_GGML_API size_t wsp_ggml_graph_overhead(void);
1856
2104
  WSP_GGML_API size_t wsp_ggml_graph_overhead_custom(size_t size, bool grads);
1857
2105
 
1858
- // wsp_ggml_graph_plan() has to be called before wsp_ggml_graph_compute()
1859
- // when plan.work_size > 0, caller must allocate memory for plan.work_data
1860
- WSP_GGML_API struct wsp_ggml_cplan wsp_ggml_graph_plan (const struct wsp_ggml_cgraph * cgraph, int n_threads /*= WSP_GGML_DEFAULT_N_THREADS*/);
1861
- WSP_GGML_API int wsp_ggml_graph_compute( struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);
1862
-
1863
- // same as wsp_ggml_graph_compute() but the work data is allocated as a part of the context
1864
- // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
1865
- WSP_GGML_API void wsp_ggml_graph_compute_with_ctx(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph, int n_threads);
1866
-
1867
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgraph, const char * name);
1868
-
1869
- WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname);
1870
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval);
2106
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor (const struct wsp_ggml_cgraph * cgraph, const char * name);
2107
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_grad (const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node);
2108
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_grad_acc(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node);
1871
2109
 
1872
2110
  // print info and performance information for the graph
1873
2111
  WSP_GGML_API void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph);
@@ -1875,191 +2113,14 @@ extern "C" {
1875
2113
  // dump the graph into a file using the dot format
1876
2114
  WSP_GGML_API void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp_ggml_cgraph * gf, const char * filename);
1877
2115
 
1878
- // build gradient checkpointing backward graph gb for gf using provided checkpoints
1879
- // gb_tmp will contain original backward graph with rewritten backward process nodes,
1880
- // but without the second forward pass nodes.
1881
- WSP_GGML_API void wsp_ggml_build_backward_gradient_checkpointing(
1882
- struct wsp_ggml_context * ctx,
1883
- struct wsp_ggml_cgraph * gf,
1884
- struct wsp_ggml_cgraph * gb,
1885
- struct wsp_ggml_cgraph * gb_tmp,
1886
- struct wsp_ggml_tensor * * checkpoints,
1887
- int n_checkpoints);
1888
- //
1889
- // optimization
1890
- //
1891
-
1892
- // optimization methods
1893
- enum wsp_ggml_opt_type {
1894
- WSP_GGML_OPT_ADAM,
1895
- WSP_GGML_OPT_LBFGS,
1896
- };
1897
-
1898
- // linesearch methods
1899
- enum wsp_ggml_linesearch {
1900
- WSP_GGML_LINESEARCH_DEFAULT = 1,
1901
-
1902
- WSP_GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
1903
- WSP_GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
1904
- WSP_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
1905
- };
1906
-
1907
- // optimization return values
1908
- enum wsp_ggml_opt_result {
1909
- WSP_GGML_OPT_OK = 0,
1910
- WSP_GGML_OPT_DID_NOT_CONVERGE,
1911
- WSP_GGML_OPT_NO_CONTEXT,
1912
- WSP_GGML_OPT_INVALID_WOLFE,
1913
- WSP_GGML_OPT_FAIL,
1914
- WSP_GGML_OPT_CANCEL,
1915
-
1916
- WSP_GGML_LINESEARCH_FAIL = -128,
1917
- WSP_GGML_LINESEARCH_MINIMUM_STEP,
1918
- WSP_GGML_LINESEARCH_MAXIMUM_STEP,
1919
- WSP_GGML_LINESEARCH_MAXIMUM_ITERATIONS,
1920
- WSP_GGML_LINESEARCH_INVALID_PARAMETERS,
1921
- };
1922
-
1923
- typedef void (*wsp_ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
2116
+ // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
1924
2117
  typedef void (*wsp_ggml_log_callback)(enum wsp_ggml_log_level level, const char * text, void * user_data);
1925
2118
 
1926
- // optimization parameters
1927
- //
1928
- // see ggml.c (wsp_ggml_opt_default_params) for default values
1929
- //
1930
- struct wsp_ggml_opt_params {
1931
- enum wsp_ggml_opt_type type;
1932
-
1933
- size_t graph_size;
1934
-
1935
- int n_threads;
1936
-
1937
- // delta-based convergence test
1938
- //
1939
- // if past == 0 - disabled
1940
- // if past > 0:
1941
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
1942
- //
1943
- int past;
1944
- float delta;
1945
-
1946
- // maximum number of iterations without improvement
1947
- //
1948
- // if 0 - disabled
1949
- // if > 0:
1950
- // assume convergence if no cost improvement in this number of iterations
1951
- //
1952
- int max_no_improvement;
1953
-
1954
- bool print_forward_graph;
1955
- bool print_backward_graph;
1956
-
1957
- int n_gradient_accumulation;
1958
-
1959
- // ADAM parameters
1960
- struct {
1961
- int n_iter;
1962
-
1963
- float sched; // schedule multiplier (fixed, decay or warmup)
1964
- float decay; // weight decay for AdamW, use 0.0f to disable
1965
- int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
1966
- float alpha; // learning rate
1967
- float beta1;
1968
- float beta2;
1969
- float eps; // epsilon for numerical stability
1970
- float eps_f; // epsilon for convergence test
1971
- float eps_g; // epsilon for convergence test
1972
- float gclip; // gradient clipping
1973
- } adam;
1974
-
1975
- // LBFGS parameters
1976
- struct {
1977
- int m; // number of corrections to approximate the inv. Hessian
1978
- int n_iter;
1979
- int max_linesearch;
1980
-
1981
- float eps; // convergence tolerance
1982
- float ftol; // line search tolerance
1983
- float wolfe;
1984
- float min_step;
1985
- float max_step;
1986
-
1987
- enum wsp_ggml_linesearch linesearch;
1988
- } lbfgs;
1989
- };
1990
-
1991
- struct wsp_ggml_opt_context {
1992
- struct wsp_ggml_context * ctx;
1993
- struct wsp_ggml_opt_params params;
1994
-
1995
- int iter;
1996
- int64_t nx; // number of parameter elements
1997
-
1998
- bool just_initialized;
1999
-
2000
- float loss_before;
2001
- float loss_after;
2002
-
2003
- struct {
2004
- struct wsp_ggml_tensor * g; // current gradient
2005
- struct wsp_ggml_tensor * m; // first moment
2006
- struct wsp_ggml_tensor * v; // second moment
2007
- struct wsp_ggml_tensor * pf; // past function values
2008
- float fx_best;
2009
- float fx_prev;
2010
- int n_no_improvement;
2011
- } adam;
2012
-
2013
- struct {
2014
- struct wsp_ggml_tensor * x; // current parameters
2015
- struct wsp_ggml_tensor * xp; // previous parameters
2016
- struct wsp_ggml_tensor * g; // current gradient
2017
- struct wsp_ggml_tensor * gp; // previous gradient
2018
- struct wsp_ggml_tensor * d; // search direction
2019
- struct wsp_ggml_tensor * pf; // past function values
2020
- struct wsp_ggml_tensor * lmal; // the L-BFGS memory alpha
2021
- struct wsp_ggml_tensor * lmys; // the L-BFGS memory ys
2022
- struct wsp_ggml_tensor * lms; // the L-BFGS memory s
2023
- struct wsp_ggml_tensor * lmy; // the L-BFGS memory y
2024
- float fx_best;
2025
- float step;
2026
- int j;
2027
- int k;
2028
- int end;
2029
- int n_no_improvement;
2030
- } lbfgs;
2031
- };
2032
-
2033
- WSP_GGML_API struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type type);
2119
+ // Set callback for all future logging events.
2120
+ // If this is not called, or NULL is supplied, everything is output on stderr.
2121
+ WSP_GGML_API void wsp_ggml_log_set(wsp_ggml_log_callback log_callback, void * user_data);
2034
2122
 
2035
- // optimize the function defined by the tensor f
2036
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt(
2037
- struct wsp_ggml_context * ctx,
2038
- struct wsp_ggml_opt_params params,
2039
- struct wsp_ggml_tensor * f);
2040
-
2041
- // initialize optimizer context
2042
- WSP_GGML_API void wsp_ggml_opt_init(
2043
- struct wsp_ggml_context * ctx,
2044
- struct wsp_ggml_opt_context * opt,
2045
- struct wsp_ggml_opt_params params,
2046
- int64_t nx);
2047
-
2048
- // continue optimizing the function defined by the tensor f
2049
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume(
2050
- struct wsp_ggml_context * ctx,
2051
- struct wsp_ggml_opt_context * opt,
2052
- struct wsp_ggml_tensor * f);
2053
-
2054
- // continue optimizing the function defined by the tensor f
2055
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume_g(
2056
- struct wsp_ggml_context * ctx,
2057
- struct wsp_ggml_opt_context * opt,
2058
- struct wsp_ggml_tensor * f,
2059
- struct wsp_ggml_cgraph * gf,
2060
- struct wsp_ggml_cgraph * gb,
2061
- wsp_ggml_opt_callback callback,
2062
- void * callback_data);
2123
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);
2063
2124
 
2064
2125
  //
2065
2126
  // quantization
@@ -2077,201 +2138,83 @@ extern "C" {
2077
2138
  WSP_GGML_API void wsp_ggml_wsp_quantize_init(enum wsp_ggml_type type);
2078
2139
  WSP_GGML_API void wsp_ggml_wsp_quantize_free(void);
2079
2140
 
2080
- // TODO: these would probably get removed in favor of the more general wsp_ggml_wsp_quantize_chunk
2081
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2082
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
2083
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
2084
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
2085
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
2086
-
2087
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
2088
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
2089
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2090
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2091
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2092
-
2093
2141
  // some quantization type cannot be used without an importance matrix
2094
2142
  WSP_GGML_API bool wsp_ggml_wsp_quantize_requires_imatrix(enum wsp_ggml_type type);
2095
2143
 
2096
2144
  // calls wsp_ggml_wsp_quantize_init internally (i.e. can allocate memory)
2097
- WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst,
2098
- int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
2099
-
2100
- //
2101
- // gguf
2102
- //
2103
-
2104
- enum wsp_gguf_type {
2105
- WSP_GGUF_TYPE_UINT8 = 0,
2106
- WSP_GGUF_TYPE_INT8 = 1,
2107
- WSP_GGUF_TYPE_UINT16 = 2,
2108
- WSP_GGUF_TYPE_INT16 = 3,
2109
- WSP_GGUF_TYPE_UINT32 = 4,
2110
- WSP_GGUF_TYPE_INT32 = 5,
2111
- WSP_GGUF_TYPE_FLOAT32 = 6,
2112
- WSP_GGUF_TYPE_BOOL = 7,
2113
- WSP_GGUF_TYPE_STRING = 8,
2114
- WSP_GGUF_TYPE_ARRAY = 9,
2115
- WSP_GGUF_TYPE_UINT64 = 10,
2116
- WSP_GGUF_TYPE_INT64 = 11,
2117
- WSP_GGUF_TYPE_FLOAT64 = 12,
2118
- WSP_GGUF_TYPE_COUNT, // marks the end of the enum
2145
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(
2146
+ enum wsp_ggml_type type,
2147
+ const float * src,
2148
+ void * dst,
2149
+ int64_t start,
2150
+ int64_t nrows,
2151
+ int64_t n_per_row,
2152
+ const float * imatrix);
2153
+
2154
+ #ifdef __cplusplus
2155
+ // restrict not standard in C++
2156
+ # if defined(__GNUC__)
2157
+ # define WSP_GGML_RESTRICT __restrict__
2158
+ # elif defined(__clang__)
2159
+ # define WSP_GGML_RESTRICT __restrict
2160
+ # elif defined(_MSC_VER)
2161
+ # define WSP_GGML_RESTRICT __restrict
2162
+ # else
2163
+ # define WSP_GGML_RESTRICT
2164
+ # endif
2165
+ #else
2166
+ # if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L)
2167
+ # define WSP_GGML_RESTRICT __restrict
2168
+ # else
2169
+ # define WSP_GGML_RESTRICT restrict
2170
+ # endif
2171
+ #endif
2172
+ typedef void (*wsp_ggml_to_float_t) (const void * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
2173
+ typedef void (*wsp_ggml_from_float_t)(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int64_t k);
2174
+
2175
+ struct wsp_ggml_type_traits {
2176
+ const char * type_name;
2177
+ int64_t blck_size;
2178
+ int64_t blck_size_interleave; // interleave elements in blocks
2179
+ size_t type_size;
2180
+ bool is_quantized;
2181
+ wsp_ggml_to_float_t to_float;
2182
+ wsp_ggml_from_float_t from_float_ref;
2119
2183
  };
2120
2184
 
2121
- struct wsp_gguf_context;
2185
+ WSP_GGML_API const struct wsp_ggml_type_traits * wsp_ggml_get_type_traits(enum wsp_ggml_type type);
2122
2186
 
2123
- struct wsp_gguf_init_params {
2124
- bool no_alloc;
2187
+ // ggml threadpool
2188
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
2189
+ // the goal should be to create an API that other backends can use move everything to the ggml base
2125
2190
 
2126
- // if not NULL, create a wsp_ggml_context and allocate the tensor data in it
2127
- struct wsp_ggml_context ** ctx;
2191
+ // scheduling priorities
2192
+ enum wsp_ggml_sched_priority {
2193
+ WSP_GGML_SCHED_PRIO_LOW = -1,
2194
+ WSP_GGML_SCHED_PRIO_NORMAL,
2195
+ WSP_GGML_SCHED_PRIO_MEDIUM,
2196
+ WSP_GGML_SCHED_PRIO_HIGH,
2197
+ WSP_GGML_SCHED_PRIO_REALTIME
2128
2198
  };
2129
2199
 
2130
- WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_empty(void);
2131
- WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp_gguf_init_params params);
2132
- //WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_buffer(..);
2133
-
2134
- WSP_GGML_API void wsp_gguf_free(struct wsp_gguf_context * ctx);
2135
-
2136
- WSP_GGML_API const char * wsp_gguf_type_name(enum wsp_gguf_type type);
2137
-
2138
- WSP_GGML_API int wsp_gguf_get_version (const struct wsp_gguf_context * ctx);
2139
- WSP_GGML_API size_t wsp_gguf_get_alignment (const struct wsp_gguf_context * ctx);
2140
- WSP_GGML_API size_t wsp_gguf_get_data_offset(const struct wsp_gguf_context * ctx);
2141
- WSP_GGML_API void * wsp_gguf_get_data (const struct wsp_gguf_context * ctx);
2142
-
2143
- WSP_GGML_API int wsp_gguf_get_n_kv(const struct wsp_gguf_context * ctx);
2144
- WSP_GGML_API int wsp_gguf_find_key(const struct wsp_gguf_context * ctx, const char * key);
2145
- WSP_GGML_API const char * wsp_gguf_get_key (const struct wsp_gguf_context * ctx, int key_id);
2146
-
2147
- WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_kv_type (const struct wsp_gguf_context * ctx, int key_id);
2148
- WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_arr_type(const struct wsp_gguf_context * ctx, int key_id);
2149
-
2150
- // will abort if the wrong type is used for the key
2151
- WSP_GGML_API uint8_t wsp_gguf_get_val_u8 (const struct wsp_gguf_context * ctx, int key_id);
2152
- WSP_GGML_API int8_t wsp_gguf_get_val_i8 (const struct wsp_gguf_context * ctx, int key_id);
2153
- WSP_GGML_API uint16_t wsp_gguf_get_val_u16 (const struct wsp_gguf_context * ctx, int key_id);
2154
- WSP_GGML_API int16_t wsp_gguf_get_val_i16 (const struct wsp_gguf_context * ctx, int key_id);
2155
- WSP_GGML_API uint32_t wsp_gguf_get_val_u32 (const struct wsp_gguf_context * ctx, int key_id);
2156
- WSP_GGML_API int32_t wsp_gguf_get_val_i32 (const struct wsp_gguf_context * ctx, int key_id);
2157
- WSP_GGML_API float wsp_gguf_get_val_f32 (const struct wsp_gguf_context * ctx, int key_id);
2158
- WSP_GGML_API uint64_t wsp_gguf_get_val_u64 (const struct wsp_gguf_context * ctx, int key_id);
2159
- WSP_GGML_API int64_t wsp_gguf_get_val_i64 (const struct wsp_gguf_context * ctx, int key_id);
2160
- WSP_GGML_API double wsp_gguf_get_val_f64 (const struct wsp_gguf_context * ctx, int key_id);
2161
- WSP_GGML_API bool wsp_gguf_get_val_bool(const struct wsp_gguf_context * ctx, int key_id);
2162
- WSP_GGML_API const char * wsp_gguf_get_val_str (const struct wsp_gguf_context * ctx, int key_id);
2163
- WSP_GGML_API const void * wsp_gguf_get_val_data(const struct wsp_gguf_context * ctx, int key_id);
2164
- WSP_GGML_API int wsp_gguf_get_arr_n (const struct wsp_gguf_context * ctx, int key_id);
2165
- WSP_GGML_API const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int key_id);
2166
- WSP_GGML_API const char * wsp_gguf_get_arr_str (const struct wsp_gguf_context * ctx, int key_id, int i);
2167
-
2168
- WSP_GGML_API int wsp_gguf_get_n_tensors (const struct wsp_gguf_context * ctx);
2169
- WSP_GGML_API int wsp_gguf_find_tensor (const struct wsp_gguf_context * ctx, const char * name);
2170
- WSP_GGML_API size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int i);
2171
- WSP_GGML_API char * wsp_gguf_get_tensor_name (const struct wsp_gguf_context * ctx, int i);
2172
- WSP_GGML_API enum wsp_ggml_type wsp_gguf_get_tensor_type (const struct wsp_gguf_context * ctx, int i);
2173
-
2174
- // overrides existing values or adds a new one
2175
- WSP_GGML_API void wsp_gguf_set_val_u8 (struct wsp_gguf_context * ctx, const char * key, uint8_t val);
2176
- WSP_GGML_API void wsp_gguf_set_val_i8 (struct wsp_gguf_context * ctx, const char * key, int8_t val);
2177
- WSP_GGML_API void wsp_gguf_set_val_u16 (struct wsp_gguf_context * ctx, const char * key, uint16_t val);
2178
- WSP_GGML_API void wsp_gguf_set_val_i16 (struct wsp_gguf_context * ctx, const char * key, int16_t val);
2179
- WSP_GGML_API void wsp_gguf_set_val_u32 (struct wsp_gguf_context * ctx, const char * key, uint32_t val);
2180
- WSP_GGML_API void wsp_gguf_set_val_i32 (struct wsp_gguf_context * ctx, const char * key, int32_t val);
2181
- WSP_GGML_API void wsp_gguf_set_val_f32 (struct wsp_gguf_context * ctx, const char * key, float val);
2182
- WSP_GGML_API void wsp_gguf_set_val_u64 (struct wsp_gguf_context * ctx, const char * key, uint64_t val);
2183
- WSP_GGML_API void wsp_gguf_set_val_i64 (struct wsp_gguf_context * ctx, const char * key, int64_t val);
2184
- WSP_GGML_API void wsp_gguf_set_val_f64 (struct wsp_gguf_context * ctx, const char * key, double val);
2185
- WSP_GGML_API void wsp_gguf_set_val_bool(struct wsp_gguf_context * ctx, const char * key, bool val);
2186
- WSP_GGML_API void wsp_gguf_set_val_str (struct wsp_gguf_context * ctx, const char * key, const char * val);
2187
- WSP_GGML_API void wsp_gguf_set_arr_data(struct wsp_gguf_context * ctx, const char * key, enum wsp_gguf_type type, const void * data, int n);
2188
- WSP_GGML_API void wsp_gguf_set_arr_str (struct wsp_gguf_context * ctx, const char * key, const char ** data, int n);
2189
-
2190
- // set or add KV pairs from another context
2191
- WSP_GGML_API void wsp_gguf_set_kv(struct wsp_gguf_context * ctx, struct wsp_gguf_context * src);
2192
-
2193
- // manage tensor info
2194
- WSP_GGML_API void wsp_gguf_add_tensor(struct wsp_gguf_context * ctx, const struct wsp_ggml_tensor * tensor);
2195
- WSP_GGML_API void wsp_gguf_set_tensor_type(struct wsp_gguf_context * ctx, const char * name, enum wsp_ggml_type type);
2196
- WSP_GGML_API void wsp_gguf_set_tensor_data(struct wsp_gguf_context * ctx, const char * name, const void * data, size_t size);
2197
-
2198
- // writing gguf files can be done in 2 ways:
2199
- //
2200
- // - write the entire wsp_gguf_context to a binary file in a single pass:
2201
- //
2202
- // wsp_gguf_write_to_file(ctx, fname);
2203
- //
2204
- // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
2205
- //
2206
- // FILE * f = fopen(fname, "wb");
2207
- // fseek(f, wsp_gguf_get_meta_size(ctx), SEEK_SET);
2208
- // fwrite(f, ...);
2209
- // void * data = wsp_gguf_meta_get_meta_data(ctx);
2210
- // fseek(f, 0, SEEK_SET);
2211
- // fwrite(f, data, wsp_gguf_get_meta_size(ctx));
2212
- // free(data);
2213
- // fclose(f);
2214
- //
2215
-
2216
- // write the entire context to a binary file
2217
- WSP_GGML_API void wsp_gguf_write_to_file(const struct wsp_gguf_context * ctx, const char * fname, bool only_meta);
2218
-
2219
- // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
2220
- WSP_GGML_API size_t wsp_gguf_get_meta_size(const struct wsp_gguf_context * ctx);
2221
- WSP_GGML_API void wsp_gguf_get_meta_data(const struct wsp_gguf_context * ctx, void * data);
2200
+ // threadpool params
2201
+ // Use wsp_ggml_threadpool_params_default() or wsp_ggml_threadpool_params_init() to populate the defaults
2202
+ struct wsp_ggml_threadpool_params {
2203
+ bool cpumask[WSP_GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
2204
+ int n_threads; // number of threads
2205
+ enum wsp_ggml_sched_priority prio; // thread priority
2206
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
2207
+ bool strict_cpu; // strict cpu placement
2208
+ bool paused; // start in paused state
2209
+ };
2222
2210
 
2223
- //
2224
- // system info
2225
- //
2211
+ struct wsp_ggml_threadpool; // forward declaration, see ggml.c
2226
2212
 
2227
- WSP_GGML_API int wsp_ggml_cpu_has_avx (void);
2228
- WSP_GGML_API int wsp_ggml_cpu_has_avx_vnni (void);
2229
- WSP_GGML_API int wsp_ggml_cpu_has_avx2 (void);
2230
- WSP_GGML_API int wsp_ggml_cpu_has_avx512 (void);
2231
- WSP_GGML_API int wsp_ggml_cpu_has_avx512_vbmi(void);
2232
- WSP_GGML_API int wsp_ggml_cpu_has_avx512_vnni(void);
2233
- WSP_GGML_API int wsp_ggml_cpu_has_fma (void);
2234
- WSP_GGML_API int wsp_ggml_cpu_has_neon (void);
2235
- WSP_GGML_API int wsp_ggml_cpu_has_arm_fma (void);
2236
- WSP_GGML_API int wsp_ggml_cpu_has_metal (void);
2237
- WSP_GGML_API int wsp_ggml_cpu_has_f16c (void);
2238
- WSP_GGML_API int wsp_ggml_cpu_has_fp16_va (void);
2239
- WSP_GGML_API int wsp_ggml_cpu_has_wasm_simd (void);
2240
- WSP_GGML_API int wsp_ggml_cpu_has_blas (void);
2241
- WSP_GGML_API int wsp_ggml_cpu_has_cublas (void);
2242
- WSP_GGML_API int wsp_ggml_cpu_has_clblast (void);
2243
- WSP_GGML_API int wsp_ggml_cpu_has_gpublas (void);
2244
- WSP_GGML_API int wsp_ggml_cpu_has_sse3 (void);
2245
- WSP_GGML_API int wsp_ggml_cpu_has_ssse3 (void);
2246
- WSP_GGML_API int wsp_ggml_cpu_has_vsx (void);
2213
+ typedef struct wsp_ggml_threadpool * wsp_ggml_threadpool_t;
2247
2214
 
2248
- //
2249
- // Internal types and functions exposed for tests and benchmarks
2250
- //
2251
-
2252
- #ifdef __cplusplus
2253
- // restrict not standard in C++
2254
- #define WSP_GGML_RESTRICT
2255
- #else
2256
- #define WSP_GGML_RESTRICT restrict
2257
- #endif
2258
- typedef void (*wsp_ggml_to_float_t) (const void * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int k);
2259
- typedef void (*wsp_ggml_from_float_t)(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int k);
2260
- typedef void (*wsp_ggml_vec_dot_t) (const int n, float * WSP_GGML_RESTRICT s, const void * WSP_GGML_RESTRICT x, const void * WSP_GGML_RESTRICT y);
2261
-
2262
- typedef struct {
2263
- const char * type_name;
2264
- int blck_size;
2265
- size_t type_size;
2266
- bool is_quantized;
2267
- wsp_ggml_to_float_t to_float;
2268
- wsp_ggml_from_float_t from_float;
2269
- wsp_ggml_from_float_t from_float_reference;
2270
- wsp_ggml_vec_dot_t vec_dot;
2271
- enum wsp_ggml_type vec_dot_type;
2272
- } wsp_ggml_type_traits_t;
2273
-
2274
- WSP_GGML_API wsp_ggml_type_traits_t wsp_ggml_internal_get_type_traits(enum wsp_ggml_type type);
2215
+ WSP_GGML_API struct wsp_ggml_threadpool_params wsp_ggml_threadpool_params_default(int n_threads);
2216
+ WSP_GGML_API void wsp_ggml_threadpool_params_init (struct wsp_ggml_threadpool_params * p, int n_threads);
2217
+ WSP_GGML_API bool wsp_ggml_threadpool_params_match (const struct wsp_ggml_threadpool_params * p0, const struct wsp_ggml_threadpool_params * p1);
2275
2218
 
2276
2219
  #ifdef __cplusplus
2277
2220
  }