whisper.rn 0.4.0-rc.1 → 0.4.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +21 -1
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +27 -92
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +86 -40
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +85 -131
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +226 -109
  9. package/android/src/newarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  10. package/android/src/oldarch/java/com/rnwhisper/RNWhisperModule.java +10 -0
  11. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  12. package/cpp/coreml/whisper-encoder.h +4 -0
  13. package/cpp/coreml/whisper-encoder.mm +5 -3
  14. package/cpp/ggml-alloc.c +797 -400
  15. package/cpp/ggml-alloc.h +60 -10
  16. package/cpp/ggml-backend-impl.h +255 -0
  17. package/cpp/ggml-backend-reg.cpp +582 -0
  18. package/cpp/ggml-backend.cpp +2002 -0
  19. package/cpp/ggml-backend.h +354 -0
  20. package/cpp/ggml-common.h +1851 -0
  21. package/cpp/ggml-cpp.h +39 -0
  22. package/cpp/ggml-cpu-aarch64.cpp +4247 -0
  23. package/cpp/ggml-cpu-aarch64.h +8 -0
  24. package/cpp/ggml-cpu-impl.h +531 -0
  25. package/cpp/ggml-cpu-quants.c +12245 -0
  26. package/cpp/ggml-cpu-quants.h +63 -0
  27. package/cpp/ggml-cpu-traits.cpp +36 -0
  28. package/cpp/ggml-cpu-traits.h +38 -0
  29. package/cpp/ggml-cpu.c +14792 -0
  30. package/cpp/ggml-cpu.cpp +653 -0
  31. package/cpp/ggml-cpu.h +137 -0
  32. package/cpp/ggml-impl.h +567 -0
  33. package/cpp/ggml-metal-impl.h +288 -0
  34. package/cpp/ggml-metal.h +24 -43
  35. package/cpp/ggml-metal.m +4867 -1080
  36. package/cpp/ggml-opt.cpp +854 -0
  37. package/cpp/ggml-opt.h +216 -0
  38. package/cpp/ggml-quants.c +5238 -0
  39. package/cpp/ggml-quants.h +100 -0
  40. package/cpp/ggml-threading.cpp +12 -0
  41. package/cpp/ggml-threading.h +14 -0
  42. package/cpp/ggml-whisper.metallib +0 -0
  43. package/cpp/ggml.c +5106 -19431
  44. package/cpp/ggml.h +847 -669
  45. package/cpp/gguf.cpp +1329 -0
  46. package/cpp/gguf.h +202 -0
  47. package/cpp/rn-audioutils.cpp +68 -0
  48. package/cpp/rn-audioutils.h +14 -0
  49. package/cpp/rn-whisper-log.h +11 -0
  50. package/cpp/rn-whisper.cpp +221 -52
  51. package/cpp/rn-whisper.h +50 -15
  52. package/cpp/whisper.cpp +3174 -1533
  53. package/cpp/whisper.h +176 -44
  54. package/ios/RNWhisper.mm +139 -46
  55. package/ios/RNWhisperAudioUtils.h +1 -2
  56. package/ios/RNWhisperAudioUtils.m +18 -67
  57. package/ios/RNWhisperContext.h +11 -8
  58. package/ios/RNWhisperContext.mm +195 -150
  59. package/jest/mock.js +15 -2
  60. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  61. package/lib/commonjs/index.js +76 -28
  62. package/lib/commonjs/index.js.map +1 -1
  63. package/lib/commonjs/version.json +1 -1
  64. package/lib/module/NativeRNWhisper.js.map +1 -1
  65. package/lib/module/index.js +76 -28
  66. package/lib/module/index.js.map +1 -1
  67. package/lib/module/version.json +1 -1
  68. package/lib/typescript/NativeRNWhisper.d.ts +13 -4
  69. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  70. package/lib/typescript/index.d.ts +37 -5
  71. package/lib/typescript/index.d.ts.map +1 -1
  72. package/package.json +9 -7
  73. package/src/NativeRNWhisper.ts +20 -4
  74. package/src/index.ts +98 -42
  75. package/src/version.json +1 -1
  76. package/whisper-rn.podspec +13 -20
  77. package/cpp/README.md +0 -4
  78. package/cpp/ggml-metal.metal +0 -2353
package/cpp/ggml.h CHANGED
@@ -58,7 +58,8 @@
 //   {
 //       ...
 //
-//       struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(f);
+//       struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx);
+//       wsp_ggml_build_forward_expand(gf, f);
 //
 //       // set the input variable and parameter values
 //       wsp_ggml_set_f32(x, 2.0f);
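The updated usage comment above reflects the reworked graph API in this release: a computation graph is now allocated from the context with wsp_ggml_new_graph() and populated with wsp_ggml_build_forward_expand(), instead of the old by-value wsp_ggml_build_forward(). Below is a minimal sketch (not taken from the package) of the new flow; wsp_ggml_new_tensor_1d() and wsp_ggml_add() are assumed helpers from the rest of this header, and the actual compute/backend setup (now handled by the new ggml-backend files in this release) is omitted.

    // Sketch only: builds a forward graph for f = a + b with the new graph API.
    #include "ggml.h"

    static void build_forward_example(void) {
        struct wsp_ggml_init_params params = {
            /*.mem_size   =*/ 16 * 1024 * 1024,  // 16 MiB memory pool
            /*.mem_buffer =*/ NULL,              // let ggml allocate the pool internally
            /*.no_alloc   =*/ false,
        };
        struct wsp_ggml_context * ctx = wsp_ggml_init(params);

        // assumed tensor constructors/ops from ggml.h
        struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 8);
        struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 8);
        struct wsp_ggml_tensor * f = wsp_ggml_add(ctx, a, b);

        // old API: struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(f);
        struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx);
        wsp_ggml_build_forward_expand(gf, f);

        wsp_ggml_free(ctx);
    }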
@@ -175,15 +176,15 @@
 #ifdef WSP_GGML_SHARED
 #    if defined(_WIN32) && !defined(__MINGW32__)
 #        ifdef WSP_GGML_BUILD
-#            define WSP_GGML_API __declspec(dllexport)
+#            define WSP_GGML_API __declspec(dllexport) extern
 #        else
-#            define WSP_GGML_API __declspec(dllimport)
+#            define WSP_GGML_API __declspec(dllimport) extern
 #        endif
 #    else
-#        define WSP_GGML_API __attribute__ ((visibility ("default")))
+#        define WSP_GGML_API __attribute__ ((visibility ("default"))) extern
 #    endif
 #else
-#    define WSP_GGML_API
+#    define WSP_GGML_API extern
 #endif
 
 // TODO: support for clang
@@ -197,30 +198,35 @@
 
 #ifndef __GNUC__
 #    define WSP_GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
+#elif defined(__MINGW32__) && !defined(__clang__)
 #    define WSP_GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
 #else
 #    define WSP_GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
 #endif
 
-#include <stdint.h>
-#include <stddef.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
 
 #define WSP_GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define WSP_GGML_FILE_VERSION 1
+#define WSP_GGML_FILE_VERSION 2
 
 #define WSP_GGML_QNT_VERSION        2    // bump this on quantization format changes
 #define WSP_GGML_QNT_VERSION_FACTOR 1000 // do not change this
 
-#define WSP_GGML_MAX_DIMS          4
-#define WSP_GGML_MAX_NODES         4096
-#define WSP_GGML_MAX_PARAMS        256
-#define WSP_GGML_MAX_CONTEXTS      64
-#define WSP_GGML_MAX_SRC           6
-#define WSP_GGML_MAX_NAME          64
-#define WSP_GGML_MAX_OP_PARAMS     32
-#define WSP_GGML_DEFAULT_N_THREADS 4
+#define WSP_GGML_MAX_DIMS           4
+#define WSP_GGML_MAX_PARAMS         2048
+#define WSP_GGML_MAX_SRC            10
+#define WSP_GGML_MAX_N_THREADS      512
+#define WSP_GGML_MAX_OP_PARAMS      64
+
+#ifndef WSP_GGML_MAX_NAME
+#    define WSP_GGML_MAX_NAME       64
+#endif
+
+#define WSP_GGML_DEFAULT_N_THREADS  4
+#define WSP_GGML_DEFAULT_GRAPH_SIZE 2048
 
 #if UINTPTR_MAX == 0xFFFFFFFF
     #define WSP_GGML_MEM_ALIGN 4
@@ -231,22 +237,34 @@
 #define WSP_GGML_EXIT_SUCCESS 0
 #define WSP_GGML_EXIT_ABORTED 1
 
-#define GGUF_MAGIC 0x46554747 // "GGUF"
-#define GGUF_VERSION 2
-
-#define GGUF_DEFAULT_ALIGNMENT 32
+#define WSP_GGML_ROPE_TYPE_NEOX   2
+#define WSP_GGML_ROPE_TYPE_MROPE  8
+#define WSP_GGML_ROPE_TYPE_VISION 24
 
 #define WSP_GGML_UNUSED(x) (void)(x)
 
 #define WSP_GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
 
-#define WSP_GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fprintf(stderr, "WSP_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            abort(); \
-        } \
-    } while (0)
+#ifndef NDEBUG
+#    define WSP_GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
+#elif defined(__GNUC__)
+#    define WSP_GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#    define WSP_GGML_UNREACHABLE() __assume(0)
+#else
+#    define WSP_GGML_UNREACHABLE() ((void) 0)
+#endif
+
+#ifdef __cplusplus
+#    define WSP_GGML_NORETURN [[noreturn]]
+#elif defined(_MSC_VER)
+#    define WSP_GGML_NORETURN __declspec(noreturn)
+#else
+#    define WSP_GGML_NORETURN _Noreturn
+#endif
+
+#define WSP_GGML_ABORT(...) wsp_ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
+#define WSP_GGML_ASSERT(x) if (!(x)) WSP_GGML_ABORT("WSP_GGML_ASSERT(%s) failed", #x)
 
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
@@ -272,74 +290,139 @@
     const type prefix##3 = (pointer)->array[3]; \
     WSP_GGML_UNUSED(prefix##3);
 
+#define WSP_GGML_TENSOR_UNARY_OP_LOCALS \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define WSP_GGML_TENSOR_BINARY_OP_LOCALS \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define WSP_GGML_TENSOR_BINARY_OP_LOCALS01 \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    WSP_GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb)
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON) && defined(__CUDACC__)
-    typedef half wsp_ggml_fp16_t;
-#elif defined(__ARM_NEON)
-    typedef __fp16 wsp_ggml_fp16_t;
-#else
-    typedef uint16_t wsp_ggml_fp16_t;
-#endif
+    WSP_GGML_NORETURN WSP_GGML_ATTRIBUTE_FORMAT(3, 4)
+    WSP_GGML_API void wsp_ggml_abort(const char * file, int line, const char * fmt, ...);
+
+    enum wsp_ggml_status {
+        WSP_GGML_STATUS_ALLOC_FAILED = -2,
+        WSP_GGML_STATUS_FAILED       = -1,
+        WSP_GGML_STATUS_SUCCESS      = 0,
+        WSP_GGML_STATUS_ABORTED      = 1,
+    };
 
-    // convert FP16 <-> FP32
-    WSP_GGML_API float           wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t x);
-    WSP_GGML_API wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float x);
+    // get wsp_ggml_status name string
+    WSP_GGML_API const char * wsp_ggml_status_to_string(enum wsp_ggml_status status);
 
-    WSP_GGML_API void wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t * x, float * y, int n);
-    WSP_GGML_API void wsp_ggml_fp32_to_fp16_row(const float * x, wsp_ggml_fp16_t * y, int n);
+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
+    typedef uint16_t wsp_ggml_fp16_t;
+    WSP_GGML_API float           wsp_ggml_fp16_to_fp32(wsp_ggml_fp16_t);
+    WSP_GGML_API wsp_ggml_fp16_t wsp_ggml_fp32_to_fp16(float);
+    WSP_GGML_API void            wsp_ggml_fp16_to_fp32_row(const wsp_ggml_fp16_t *, float *, int64_t);
+    WSP_GGML_API void            wsp_ggml_fp32_to_fp16_row(const float *, wsp_ggml_fp16_t *, int64_t);
+
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } wsp_ggml_bf16_t;
+    WSP_GGML_API wsp_ggml_bf16_t wsp_ggml_fp32_to_bf16(float);
+    WSP_GGML_API float           wsp_ggml_bf16_to_fp32(wsp_ggml_bf16_t); // consider just doing << 16
+    WSP_GGML_API void            wsp_ggml_bf16_to_fp32_row(const wsp_ggml_bf16_t *, float *, int64_t);
+    WSP_GGML_API void            wsp_ggml_fp32_to_bf16_row_ref(const float *, wsp_ggml_bf16_t *, int64_t);
+    WSP_GGML_API void            wsp_ggml_fp32_to_bf16_row(const float *, wsp_ggml_bf16_t *, int64_t);
 
     struct wsp_ggml_object;
     struct wsp_ggml_context;
+    struct wsp_ggml_cgraph;
 
+    // NOTE: always add types at the end of the enum to keep backward compatibility
     enum wsp_ggml_type {
-        WSP_GGML_TYPE_F32 = 0,
-        WSP_GGML_TYPE_F16 = 1,
-        WSP_GGML_TYPE_Q4_0 = 2,
-        WSP_GGML_TYPE_Q4_1 = 3,
+        WSP_GGML_TYPE_F32     = 0,
+        WSP_GGML_TYPE_F16     = 1,
+        WSP_GGML_TYPE_Q4_0    = 2,
+        WSP_GGML_TYPE_Q4_1    = 3,
         // WSP_GGML_TYPE_Q4_2 = 4, support has been removed
-        // WSP_GGML_TYPE_Q4_3 (5) support has been removed
-        WSP_GGML_TYPE_Q5_0 = 6,
-        WSP_GGML_TYPE_Q5_1 = 7,
-        WSP_GGML_TYPE_Q8_0 = 8,
-        WSP_GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        WSP_GGML_TYPE_Q2_K = 10,
-        WSP_GGML_TYPE_Q3_K = 11,
-        WSP_GGML_TYPE_Q4_K = 12,
-        WSP_GGML_TYPE_Q5_K = 13,
-        WSP_GGML_TYPE_Q6_K = 14,
-        WSP_GGML_TYPE_Q8_K = 15,
-        WSP_GGML_TYPE_I8,
-        WSP_GGML_TYPE_I16,
-        WSP_GGML_TYPE_I32,
-        WSP_GGML_TYPE_COUNT,
+        // WSP_GGML_TYPE_Q4_3 = 5, support has been removed
+        WSP_GGML_TYPE_Q5_0    = 6,
+        WSP_GGML_TYPE_Q5_1    = 7,
+        WSP_GGML_TYPE_Q8_0    = 8,
+        WSP_GGML_TYPE_Q8_1    = 9,
+        WSP_GGML_TYPE_Q2_K    = 10,
+        WSP_GGML_TYPE_Q3_K    = 11,
+        WSP_GGML_TYPE_Q4_K    = 12,
+        WSP_GGML_TYPE_Q5_K    = 13,
+        WSP_GGML_TYPE_Q6_K    = 14,
+        WSP_GGML_TYPE_Q8_K    = 15,
+        WSP_GGML_TYPE_IQ2_XXS = 16,
+        WSP_GGML_TYPE_IQ2_XS  = 17,
+        WSP_GGML_TYPE_IQ3_XXS = 18,
+        WSP_GGML_TYPE_IQ1_S   = 19,
+        WSP_GGML_TYPE_IQ4_NL  = 20,
+        WSP_GGML_TYPE_IQ3_S   = 21,
+        WSP_GGML_TYPE_IQ2_S   = 22,
+        WSP_GGML_TYPE_IQ4_XS  = 23,
+        WSP_GGML_TYPE_I8      = 24,
+        WSP_GGML_TYPE_I16     = 25,
+        WSP_GGML_TYPE_I32     = 26,
+        WSP_GGML_TYPE_I64     = 27,
+        WSP_GGML_TYPE_F64     = 28,
+        WSP_GGML_TYPE_IQ1_M   = 29,
+        WSP_GGML_TYPE_BF16    = 30,
+        // WSP_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // WSP_GGML_TYPE_Q4_0_4_8 = 32,
+        // WSP_GGML_TYPE_Q4_0_8_8 = 33,
+        WSP_GGML_TYPE_TQ1_0   = 34,
+        WSP_GGML_TYPE_TQ2_0   = 35,
+        // WSP_GGML_TYPE_IQ4_NL_4_4 = 36,
+        // WSP_GGML_TYPE_IQ4_NL_4_8 = 37,
+        // WSP_GGML_TYPE_IQ4_NL_8_8 = 38,
+        WSP_GGML_TYPE_COUNT   = 39,
     };
 
-    enum wsp_ggml_backend {
-        WSP_GGML_BACKEND_CPU = 0,
-        WSP_GGML_BACKEND_GPU = 10,
-        WSP_GGML_BACKEND_GPU_SPLIT = 20,
+    // precision
+    enum wsp_ggml_prec {
+        WSP_GGML_PREC_DEFAULT,
+        WSP_GGML_PREC_F32,
     };
 
     // model file types
     enum wsp_ggml_ftype {
-        WSP_GGML_FTYPE_UNKNOWN = -1,
-        WSP_GGML_FTYPE_ALL_F32 = 0,
-        WSP_GGML_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+        WSP_GGML_FTYPE_UNKNOWN        = -1,
+        WSP_GGML_FTYPE_ALL_F32        = 0,
+        WSP_GGML_FTYPE_MOSTLY_F16     = 1,  // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q4_0    = 2,  // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q4_1    = 3,  // except 1d tensors
         WSP_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        WSP_GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        WSP_GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q8_0    = 7,  // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q5_0    = 8,  // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q5_1    = 9,  // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q2_K    = 10, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q3_K    = 11, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q4_K    = 12, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q5_K    = 13, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_Q6_K    = 14, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ2_XS  = 16, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ3_XXS = 17, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ1_S   = 18, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ4_NL  = 19, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ3_S   = 20, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ2_S   = 21, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ4_XS  = 22, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_IQ1_M   = 23, // except 1d tensors
+        WSP_GGML_FTYPE_MOSTLY_BF16    = 24, // except 1d tensors
     };
 
     // available tensor operations:
@@ -356,10 +439,13 @@ extern "C" {
         WSP_GGML_OP_SQR,
         WSP_GGML_OP_SQRT,
         WSP_GGML_OP_LOG,
+        WSP_GGML_OP_SIN,
+        WSP_GGML_OP_COS,
         WSP_GGML_OP_SUM,
         WSP_GGML_OP_SUM_ROWS,
         WSP_GGML_OP_MEAN,
         WSP_GGML_OP_ARGMAX,
+        WSP_GGML_OP_COUNT_EQUAL,
        WSP_GGML_OP_REPEAT,
         WSP_GGML_OP_REPEAT_BACK,
         WSP_GGML_OP_CONCAT,
@@ -370,6 +456,7 @@ extern "C" {
         WSP_GGML_OP_GROUP_NORM,
 
         WSP_GGML_OP_MUL_MAT,
+        WSP_GGML_OP_MUL_MAT_ID,
         WSP_GGML_OP_OUT_PROD,
 
         WSP_GGML_OP_SCALE,
@@ -389,23 +476,32 @@ extern "C" {
         WSP_GGML_OP_SOFT_MAX_BACK,
         WSP_GGML_OP_ROPE,
         WSP_GGML_OP_ROPE_BACK,
-        WSP_GGML_OP_ALIBI,
         WSP_GGML_OP_CLAMP,
-        WSP_GGML_OP_CONV_1D,
-        WSP_GGML_OP_CONV_2D,
+        WSP_GGML_OP_CONV_TRANSPOSE_1D,
+        WSP_GGML_OP_IM2COL,
+        WSP_GGML_OP_IM2COL_BACK,
         WSP_GGML_OP_CONV_TRANSPOSE_2D,
         WSP_GGML_OP_POOL_1D,
         WSP_GGML_OP_POOL_2D,
-
+        WSP_GGML_OP_POOL_2D_BACK,
         WSP_GGML_OP_UPSCALE, // nearest interpolate
-
-        WSP_GGML_OP_FLASH_ATTN,
-        WSP_GGML_OP_FLASH_FF,
+        WSP_GGML_OP_PAD,
+        WSP_GGML_OP_PAD_REFLECT_1D,
+        WSP_GGML_OP_ARANGE,
+        WSP_GGML_OP_TIMESTEP_EMBEDDING,
+        WSP_GGML_OP_ARGSORT,
+        WSP_GGML_OP_LEAKY_RELU,
+
+        WSP_GGML_OP_FLASH_ATTN_EXT,
         WSP_GGML_OP_FLASH_ATTN_BACK,
+        WSP_GGML_OP_SSM_CONV,
+        WSP_GGML_OP_SSM_SCAN,
         WSP_GGML_OP_WIN_PART,
         WSP_GGML_OP_WIN_UNPART,
         WSP_GGML_OP_GET_REL_POS,
         WSP_GGML_OP_ADD_REL_POS,
+        WSP_GGML_OP_RWKV_WKV6,
+        WSP_GGML_OP_GATED_LINEAR_ATTN,
 
         WSP_GGML_OP_UNARY,
 
@@ -422,6 +518,7 @@ extern "C" {
 
         WSP_GGML_OP_CROSS_ENTROPY_LOSS,
         WSP_GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+        WSP_GGML_OP_OPT_STEP_ADAMW,
 
         WSP_GGML_OP_COUNT,
     };
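Several of the op-enum additions above (SIN, COS, COUNT_EQUAL, ...) correspond to new builder functions declared further down in this header. A hedged sketch of how they compose, assuming a context `ctx` and two same-shape F32 tensors `a` and `b` already exist:

    // Sketch only: element-wise sin/cos and equality count, using the new
    // wsp_ggml_sin(), wsp_ggml_cos() and wsp_ggml_count_equal() builders declared below.
    struct wsp_ggml_tensor * s  = wsp_ggml_sin(ctx, a);           // sin(a), element-wise
    struct wsp_ggml_tensor * c  = wsp_ggml_cos(ctx, a);           // cos(a), element-wise
    struct wsp_ggml_tensor * eq = wsp_ggml_count_equal(ctx, a, b); // count of equal elements in a and b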
@@ -434,41 +531,57 @@ extern "C" {
         WSP_GGML_UNARY_OP_TANH,
         WSP_GGML_UNARY_OP_ELU,
         WSP_GGML_UNARY_OP_RELU,
+        WSP_GGML_UNARY_OP_SIGMOID,
         WSP_GGML_UNARY_OP_GELU,
         WSP_GGML_UNARY_OP_GELU_QUICK,
         WSP_GGML_UNARY_OP_SILU,
+        WSP_GGML_UNARY_OP_HARDSWISH,
+        WSP_GGML_UNARY_OP_HARDSIGMOID,
+        WSP_GGML_UNARY_OP_EXP,
+
+        WSP_GGML_UNARY_OP_COUNT,
     };
 
     enum wsp_ggml_object_type {
-        WSP_GGML_OBJECT_TENSOR,
-        WSP_GGML_OBJECT_GRAPH,
-        WSP_GGML_OBJECT_WORK_BUFFER
+        WSP_GGML_OBJECT_TYPE_TENSOR,
+        WSP_GGML_OBJECT_TYPE_GRAPH,
+        WSP_GGML_OBJECT_TYPE_WORK_BUFFER
     };
 
-    // ggml object
-    struct wsp_ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct wsp_ggml_object * next;
-
-        enum wsp_ggml_object_type type;
+    enum wsp_ggml_log_level {
+        WSP_GGML_LOG_LEVEL_NONE  = 0,
+        WSP_GGML_LOG_LEVEL_DEBUG = 1,
+        WSP_GGML_LOG_LEVEL_INFO  = 2,
+        WSP_GGML_LOG_LEVEL_WARN  = 3,
+        WSP_GGML_LOG_LEVEL_ERROR = 4,
+        WSP_GGML_LOG_LEVEL_CONT  = 5, // continue previous log
+    };
 
-        char padding[4];
+    // this tensor...
+    enum wsp_ggml_tensor_flag {
+        WSP_GGML_TENSOR_FLAG_INPUT  = 1, // ...is an input for the GGML compute graph
+        WSP_GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
+        WSP_GGML_TENSOR_FLAG_PARAM  = 4, // ...contains trainable parameters
+        WSP_GGML_TENSOR_FLAG_LOSS   = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
-    static const size_t WSP_GGML_OBJECT_SIZE = sizeof(struct wsp_ggml_object);
+    struct wsp_ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
+    };
 
     // n-dimensional tensor
     struct wsp_ggml_tensor {
-        enum wsp_ggml_type type;
-        enum wsp_ggml_backend backend;
+        enum wsp_ggml_type type;
+
+        struct wsp_ggml_backend_buffer * buffer;
 
-        int n_dims;
         int64_t ne[WSP_GGML_MAX_DIMS]; // number of elements
         size_t  nb[WSP_GGML_MAX_DIMS]; // stride in bytes:
-                                       // nb[0] = sizeof(type)
-                                       // nb[1] = nb[0] * ne[0] + padding
+                                       // nb[0] = wsp_ggml_type_size(type)
+                                       // nb[1] = nb[0] * (ne[0] / wsp_ggml_blck_size(type)) + padding
                                       // nb[i] = nb[i-1] * ne[i-1]
 
         // compute data
@@ -477,16 +590,11 @@ extern "C" {
477
590
  // op params - allocated as int32_t for alignment
478
591
  int32_t op_params[WSP_GGML_MAX_OP_PARAMS / sizeof(int32_t)];
479
592
 
480
- bool is_param;
593
+ int32_t flags;
481
594
 
482
- struct wsp_ggml_tensor * grad;
483
595
  struct wsp_ggml_tensor * src[WSP_GGML_MAX_SRC];
484
596
 
485
- // performance
486
- int perf_runs;
487
- int64_t perf_cycles;
488
- int64_t perf_time_us;
489
-
597
+ // source tensor and offset for views
490
598
  struct wsp_ggml_tensor * view_src;
491
599
  size_t view_offs;
492
600
 
@@ -496,86 +604,26 @@ extern "C" {
496
604
 
497
605
  void * extra; // extra things e.g. for ggml-cuda.cu
498
606
 
499
- char padding[4];
607
+ char padding[8];
500
608
  };
501
609
 
502
610
  static const size_t WSP_GGML_TENSOR_SIZE = sizeof(struct wsp_ggml_tensor);
503
611
 
504
- // the compute plan that needs to be prepared for wsp_ggml_graph_compute()
505
- // since https://github.com/ggerganov/ggml/issues/287
506
- struct wsp_ggml_cplan {
507
- size_t work_size; // size of work buffer, calculated by `wsp_ggml_graph_plan()`
508
- uint8_t * work_data; // work buffer, to be allocated by caller before calling to `wsp_ggml_graph_compute()`
509
-
510
- int n_threads;
511
-
512
- // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
513
- int n_tasks[WSP_GGML_MAX_NODES];
514
-
515
- // abort wsp_ggml_graph_compute when true
516
- bool (*abort_callback)(void * data);
517
- void * abort_callback_data;
518
- };
519
-
520
- // next prime after WSP_GGML_MAX_NODES
521
- // #define WSP_GGML_GRAPH_HASHTABLE_SIZE 4099
522
- // next prime after WSP_GGML_MAX_NODES * 2 (nodes + leafs)
523
- #define WSP_GGML_GRAPH_HASHTABLE_SIZE 8273
524
-
525
- // computation graph
526
- struct wsp_ggml_cgraph {
527
- int n_nodes;
528
- int n_leafs;
529
-
530
- struct wsp_ggml_tensor * nodes[WSP_GGML_MAX_NODES];
531
- struct wsp_ggml_tensor * grads[WSP_GGML_MAX_NODES];
532
- struct wsp_ggml_tensor * leafs[WSP_GGML_MAX_NODES];
612
+ // Abort callback
613
+ // If not NULL, called before ggml computation
614
+ // If it returns true, the computation is aborted
615
+ typedef bool (*wsp_ggml_abort_callback)(void * data);
533
616
 
534
- void * visited_hash_table[WSP_GGML_GRAPH_HASHTABLE_SIZE];
535
617
 
536
- // performance
537
- int perf_runs;
538
- int64_t perf_cycles;
539
- int64_t perf_time_us;
540
- };
541
-
542
- static const size_t WSP_GGML_GRAPH_SIZE = sizeof(struct wsp_ggml_cgraph);
543
-
544
- // scratch buffer
545
- struct wsp_ggml_scratch {
546
- size_t offs;
547
- size_t size;
548
- void * data;
549
- };
550
-
551
- struct wsp_ggml_init_params {
552
- // memory pool
553
- size_t mem_size; // bytes
554
- void * mem_buffer; // if NULL, memory will be allocated internally
555
- bool no_alloc; // don't allocate memory for the tensor data
556
- };
557
-
558
-
559
- // compute types
560
-
561
- // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
562
- // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
563
- enum wsp_ggml_task_type {
564
- WSP_GGML_TASK_INIT = 0,
565
- WSP_GGML_TASK_COMPUTE,
566
- WSP_GGML_TASK_FINALIZE,
567
- };
568
-
569
- struct wsp_ggml_compute_params {
570
- enum wsp_ggml_task_type type;
618
+ //
619
+ // GUID
620
+ //
571
621
 
572
- // ith = thread index, nth = number of threads
573
- int ith, nth;
622
+ // GUID types
623
+ typedef uint8_t wsp_ggml_guid[16];
624
+ typedef wsp_ggml_guid * wsp_ggml_guid_t;
574
625
 
575
- // work buffer for all threads
576
- size_t wsize;
577
- void * wdata;
578
- };
626
+ WSP_GGML_API bool wsp_ggml_guid_matches(wsp_ggml_guid_t guid_a, wsp_ggml_guid_t guid_b);
579
627
 
580
628
  // misc
581
629
 
@@ -585,26 +633,32 @@ extern "C" {
585
633
  WSP_GGML_API int64_t wsp_ggml_cycles(void);
586
634
  WSP_GGML_API int64_t wsp_ggml_cycles_per_ms(void);
587
635
 
588
- WSP_GGML_API void wsp_ggml_numa_init(void); // call once for better performance on NUMA systems
589
- WSP_GGML_API bool wsp_ggml_is_numa(void); // true if init detected that system has >1 NUMA node
636
+ // accepts a UTF-8 path, even on Windows
637
+ WSP_GGML_API FILE * wsp_ggml_fopen(const char * fname, const char * mode);
590
638
 
591
639
  WSP_GGML_API void wsp_ggml_print_object (const struct wsp_ggml_object * obj);
592
640
  WSP_GGML_API void wsp_ggml_print_objects(const struct wsp_ggml_context * ctx);
593
641
 
594
- WSP_GGML_API int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
595
- WSP_GGML_API int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
596
- WSP_GGML_API size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
597
- WSP_GGML_API size_t wsp_ggml_nbytes_pad (const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN
598
- WSP_GGML_API size_t wsp_ggml_nbytes_split(const struct wsp_ggml_tensor * tensor, int nrows_split);
642
+ WSP_GGML_API int64_t wsp_ggml_nelements (const struct wsp_ggml_tensor * tensor);
643
+ WSP_GGML_API int64_t wsp_ggml_nrows (const struct wsp_ggml_tensor * tensor);
644
+ WSP_GGML_API size_t wsp_ggml_nbytes (const struct wsp_ggml_tensor * tensor);
645
+ WSP_GGML_API size_t wsp_ggml_nbytes_pad(const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_nbytes() but padded to WSP_GGML_MEM_ALIGN
599
646
 
600
- WSP_GGML_API int wsp_ggml_blck_size (enum wsp_ggml_type type);
601
- WSP_GGML_API size_t wsp_ggml_type_size (enum wsp_ggml_type type); // size in bytes for all elements in a block
602
- WSP_GGML_API float wsp_ggml_type_sizef(enum wsp_ggml_type type); // wsp_ggml_type_size()/wsp_ggml_blck_size() as float
647
+ WSP_GGML_API int64_t wsp_ggml_blck_size(enum wsp_ggml_type type);
648
+ WSP_GGML_API size_t wsp_ggml_type_size(enum wsp_ggml_type type); // size in bytes for all elements in a block
649
+ WSP_GGML_API size_t wsp_ggml_row_size (enum wsp_ggml_type type, int64_t ne); // size in bytes for all elements in a row
650
+
651
+ WSP_GGML_DEPRECATED(
652
+ WSP_GGML_API double wsp_ggml_type_sizef(enum wsp_ggml_type type), // wsp_ggml_type_size()/wsp_ggml_blck_size() as float
653
+ "use wsp_ggml_row_size() instead");
603
654
 
604
655
  WSP_GGML_API const char * wsp_ggml_type_name(enum wsp_ggml_type type);
605
656
  WSP_GGML_API const char * wsp_ggml_op_name (enum wsp_ggml_op op);
606
657
  WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);
607
658
 
659
+ WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
660
+ WSP_GGML_API const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
661
+
608
662
  WSP_GGML_API size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);
609
663
 
610
664
  WSP_GGML_API bool wsp_ggml_is_quantized(enum wsp_ggml_type type);
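The hunk above deprecates wsp_ggml_type_sizef() in favour of wsp_ggml_row_size(), which returns an exact byte count for a row of ne elements instead of a fractional per-element size. A hedged sketch of the migration, assuming a `type` and an element count `n_elems` are already in scope:

    // Sketch only: byte size of one row of n_elems elements of a given type.
    // old (deprecated): size_t row_bytes = (size_t) (wsp_ggml_type_sizef(type) * n_elems);
    size_t row_bytes = wsp_ggml_row_size(type, n_elems); // exact, also correct for block-quantized types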
@@ -613,22 +667,37 @@ extern "C" {
613
667
  WSP_GGML_API enum wsp_ggml_type wsp_ggml_ftype_to_wsp_ggml_type(enum wsp_ggml_ftype ftype);
614
668
 
615
669
  WSP_GGML_API bool wsp_ggml_is_transposed(const struct wsp_ggml_tensor * tensor);
616
- WSP_GGML_API bool wsp_ggml_is_contiguous(const struct wsp_ggml_tensor * tensor);
617
670
  WSP_GGML_API bool wsp_ggml_is_permuted (const struct wsp_ggml_tensor * tensor);
671
+ WSP_GGML_API bool wsp_ggml_is_empty (const struct wsp_ggml_tensor * tensor);
672
+ WSP_GGML_API bool wsp_ggml_is_scalar (const struct wsp_ggml_tensor * tensor);
673
+ WSP_GGML_API bool wsp_ggml_is_vector (const struct wsp_ggml_tensor * tensor);
674
+ WSP_GGML_API bool wsp_ggml_is_matrix (const struct wsp_ggml_tensor * tensor);
675
+ WSP_GGML_API bool wsp_ggml_is_3d (const struct wsp_ggml_tensor * tensor);
676
+ WSP_GGML_API int wsp_ggml_n_dims (const struct wsp_ggml_tensor * tensor); // returns 1 for scalars
618
677
 
619
- WSP_GGML_API bool wsp_ggml_are_same_shape(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
678
+ WSP_GGML_API bool wsp_ggml_is_contiguous (const struct wsp_ggml_tensor * tensor);
679
+ WSP_GGML_API bool wsp_ggml_is_contiguous_0(const struct wsp_ggml_tensor * tensor); // same as wsp_ggml_is_contiguous()
680
+ WSP_GGML_API bool wsp_ggml_is_contiguous_1(const struct wsp_ggml_tensor * tensor); // contiguous for dims >= 1
681
+ WSP_GGML_API bool wsp_ggml_is_contiguous_2(const struct wsp_ggml_tensor * tensor); // contiguous for dims >= 2
682
+
683
+ WSP_GGML_API bool wsp_ggml_are_same_shape (const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
684
+ WSP_GGML_API bool wsp_ggml_are_same_stride(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
685
+
686
+ WSP_GGML_API bool wsp_ggml_can_repeat(const struct wsp_ggml_tensor * t0, const struct wsp_ggml_tensor * t1);
620
687
 
621
688
  // use this to compute the memory overhead of a tensor
622
689
  WSP_GGML_API size_t wsp_ggml_tensor_overhead(void);
623
690
 
691
+ WSP_GGML_API bool wsp_ggml_validate_row_data(enum wsp_ggml_type type, const void * data, size_t nbytes);
692
+
624
693
  // main
625
694
 
626
- WSP_GGML_API struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params);
627
- WSP_GGML_API void wsp_ggml_free(struct wsp_ggml_context * ctx);
695
+ WSP_GGML_API struct wsp_ggml_context * wsp_ggml_init (struct wsp_ggml_init_params params);
696
+ WSP_GGML_API void wsp_ggml_reset(struct wsp_ggml_context * ctx);
697
+ WSP_GGML_API void wsp_ggml_free (struct wsp_ggml_context * ctx);
628
698
 
629
699
  WSP_GGML_API size_t wsp_ggml_used_mem(const struct wsp_ggml_context * ctx);
630
700
 
631
- WSP_GGML_API size_t wsp_ggml_set_scratch (struct wsp_ggml_context * ctx, struct wsp_ggml_scratch scratch);
632
701
  WSP_GGML_API bool wsp_ggml_get_no_alloc(struct wsp_ggml_context * ctx);
633
702
  WSP_GGML_API void wsp_ggml_set_no_alloc(struct wsp_ggml_context * ctx, bool no_alloc);
634
703
 
@@ -668,34 +737,35 @@ extern "C" {
668
737
  int64_t ne2,
669
738
  int64_t ne3);
670
739
 
671
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_i32(struct wsp_ggml_context * ctx, int32_t value);
672
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_new_f32(struct wsp_ggml_context * ctx, float value);
740
+ WSP_GGML_API void * wsp_ggml_new_buffer(struct wsp_ggml_context * ctx, size_t nbytes);
673
741
 
674
742
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_dup_tensor (struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src);
675
743
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_tensor(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * src);
676
744
 
745
+ // Context tensor enumeration and lookup
746
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_first_tensor(const struct wsp_ggml_context * ctx);
747
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_next_tensor (const struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
677
748
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name);
678
749
 
679
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);
680
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int32_t value);
681
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_f32 (struct wsp_ggml_tensor * tensor, float value);
682
-
683
- WSP_GGML_API int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i);
684
- WSP_GGML_API void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t value);
750
+ // Converts a flat index into coordinates
751
+ WSP_GGML_API void wsp_ggml_unravel_index(const struct wsp_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
685
752
 
686
- WSP_GGML_API float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i);
687
- WSP_GGML_API void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float value);
753
+ WSP_GGML_API enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
688
754
 
689
755
  WSP_GGML_API void * wsp_ggml_get_data (const struct wsp_ggml_tensor * tensor);
690
756
  WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor);
691
757
 
692
- WSP_GGML_API enum wsp_ggml_unary_op wsp_ggml_get_unary_op(const struct wsp_ggml_tensor * tensor);
693
-
694
758
  WSP_GGML_API const char * wsp_ggml_get_name (const struct wsp_ggml_tensor * tensor);
695
759
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_name ( struct wsp_ggml_tensor * tensor, const char * name);
696
760
  WSP_GGML_ATTRIBUTE_FORMAT(2, 3)
697
761
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_format_name( struct wsp_ggml_tensor * tensor, const char * fmt, ...);
698
762
 
763
+ // Tensor flags
764
+ WSP_GGML_API void wsp_ggml_set_input(struct wsp_ggml_tensor * tensor);
765
+ WSP_GGML_API void wsp_ggml_set_output(struct wsp_ggml_tensor * tensor);
766
+ WSP_GGML_API void wsp_ggml_set_param(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
767
+ WSP_GGML_API void wsp_ggml_set_loss(struct wsp_ggml_tensor * tensor);
768
+
699
769
  //
700
770
  // operations on tensors with backpropagation
701
771
  //
@@ -719,6 +789,12 @@ extern "C" {
719
789
  struct wsp_ggml_tensor * a,
720
790
  struct wsp_ggml_tensor * b);
721
791
 
792
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_cast(
793
+ struct wsp_ggml_context * ctx,
794
+ struct wsp_ggml_tensor * a,
795
+ struct wsp_ggml_tensor * b,
796
+ enum wsp_ggml_type type);
797
+
722
798
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add1(
723
799
  struct wsp_ggml_context * ctx,
724
800
  struct wsp_ggml_tensor * a,
@@ -729,6 +805,9 @@ extern "C" {
729
805
  struct wsp_ggml_tensor * a,
730
806
  struct wsp_ggml_tensor * b);
731
807
 
808
+ // dst = a
809
+ // view(dst, nb1, nb2, nb3, offset) += b
810
+ // return dst
732
811
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_acc(
733
812
  struct wsp_ggml_context * ctx,
734
813
  struct wsp_ggml_tensor * a,
@@ -801,6 +880,22 @@ extern "C" {
801
880
  struct wsp_ggml_context * ctx,
802
881
  struct wsp_ggml_tensor * a);
803
882
 
883
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sin(
884
+ struct wsp_ggml_context * ctx,
885
+ struct wsp_ggml_tensor * a);
886
+
887
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sin_inplace(
888
+ struct wsp_ggml_context * ctx,
889
+ struct wsp_ggml_tensor * a);
890
+
891
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cos(
892
+ struct wsp_ggml_context * ctx,
893
+ struct wsp_ggml_tensor * a);
894
+
895
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cos_inplace(
896
+ struct wsp_ggml_context * ctx,
897
+ struct wsp_ggml_tensor * a);
898
+
804
899
  // return scalar
805
900
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sum(
806
901
  struct wsp_ggml_context * ctx,
@@ -821,6 +916,12 @@ extern "C" {
821
916
  struct wsp_ggml_context * ctx,
822
917
  struct wsp_ggml_tensor * a);
823
918
 
919
+ // count number of equal elements in a and b
920
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_count_equal(
921
+ struct wsp_ggml_context * ctx,
922
+ struct wsp_ggml_tensor * a,
923
+ struct wsp_ggml_tensor * b);
924
+
824
925
  // if a is the same shape as b, and a is not parameter, return a
825
926
  // otherwise, return a new tensor: repeat(a) to fit in b
826
927
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat(
@@ -828,17 +929,19 @@ extern "C" {
828
929
  struct wsp_ggml_tensor * a,
829
930
  struct wsp_ggml_tensor * b);
830
931
 
932
+ // sums repetitions in a into shape of b
831
933
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat_back(
832
934
  struct wsp_ggml_context * ctx,
833
935
  struct wsp_ggml_tensor * a,
834
936
  struct wsp_ggml_tensor * b);
835
937
 
836
- // concat a and b on dim 2
938
+ // concat a and b along dim
837
939
  // used in stable-diffusion
838
940
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_concat(
839
941
  struct wsp_ggml_context * ctx,
840
942
  struct wsp_ggml_tensor * a,
841
- struct wsp_ggml_tensor * b);
943
+ struct wsp_ggml_tensor * b,
944
+ int dim);
842
945
 
843
946
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_abs(
844
947
  struct wsp_ggml_context * ctx,
@@ -892,11 +995,22 @@ extern "C" {
892
995
  struct wsp_ggml_context * ctx,
893
996
  struct wsp_ggml_tensor * a);
894
997
 
998
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_leaky_relu(
999
+ struct wsp_ggml_context * ctx,
1000
+ struct wsp_ggml_tensor * a, float negative_slope, bool inplace);
1001
+
895
1002
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_relu_inplace(
896
1003
  struct wsp_ggml_context * ctx,
897
1004
  struct wsp_ggml_tensor * a);
898
1005
 
899
- // TODO: double-check this computation is correct
1006
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sigmoid(
1007
+ struct wsp_ggml_context * ctx,
1008
+ struct wsp_ggml_tensor * a);
1009
+
1010
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_sigmoid_inplace(
1011
+ struct wsp_ggml_context * ctx,
1012
+ struct wsp_ggml_tensor * a);
1013
+
900
1014
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gelu(
901
1015
  struct wsp_ggml_context * ctx,
902
1016
  struct wsp_ggml_tensor * a);
@@ -928,6 +1042,24 @@ extern "C" {
928
1042
  struct wsp_ggml_tensor * a,
929
1043
  struct wsp_ggml_tensor * b);
930
1044
 
1045
+ // hardswish(x) = x * relu6(x + 3) / 6
1046
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_hardswish(
1047
+ struct wsp_ggml_context * ctx,
1048
+ struct wsp_ggml_tensor * a);
1049
+
1050
+ // hardsigmoid(x) = relu6(x + 3) / 6
1051
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_hardsigmoid(
1052
+ struct wsp_ggml_context * ctx,
1053
+ struct wsp_ggml_tensor * a);
1054
+
1055
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_exp(
1056
+ struct wsp_ggml_context * ctx,
1057
+ struct wsp_ggml_tensor * a);
1058
+
1059
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_exp_inplace(
1060
+ struct wsp_ggml_context * ctx,
1061
+ struct wsp_ggml_tensor * a);
1062
+
931
1063
  // normalize along rows
932
1064
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_norm(
933
1065
  struct wsp_ggml_context * ctx,
@@ -951,16 +1083,17 @@ extern "C" {
951
1083
 
952
1084
  // group normalize along ne0*ne1*n_groups
953
1085
  // used in stable-diffusion
954
- // TODO: eps is hardcoded to 1e-6 for now
955
1086
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_group_norm(
956
1087
  struct wsp_ggml_context * ctx,
957
1088
  struct wsp_ggml_tensor * a,
958
- int n_groups);
1089
+ int n_groups,
1090
+ float eps);
959
1091
 
960
1092
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_group_norm_inplace(
961
1093
  struct wsp_ggml_context * ctx,
962
1094
  struct wsp_ggml_tensor * a,
963
- int n_groups);
1095
+ int n_groups,
1096
+ float eps);
964
1097
 
965
1098
  // a - x
966
1099
  // b - dy
@@ -970,14 +1103,27 @@ extern "C" {
970
1103
  struct wsp_ggml_tensor * b,
971
1104
  float eps);
972
1105
 
973
- // A: n columns, m rows
974
- // B: n columns, p rows (i.e. we transpose it internally)
975
- // result is m columns, p rows
1106
+ // A: k columns, n rows => [ne03, ne02, n, k]
1107
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
1108
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
976
1109
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat(
977
1110
  struct wsp_ggml_context * ctx,
978
1111
  struct wsp_ggml_tensor * a,
979
1112
  struct wsp_ggml_tensor * b);
980
1113
 
1114
+ // change the precision of a matrix multiplication
1115
+ // set to WSP_GGML_PREC_F32 for higher precision (useful for phi-2)
1116
+ WSP_GGML_API void wsp_ggml_mul_mat_set_prec(
1117
+ struct wsp_ggml_tensor * a,
1118
+ enum wsp_ggml_prec prec);
1119
+
1120
+ // indirect matrix multiplication
1121
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat_id(
1122
+ struct wsp_ggml_context * ctx,
1123
+ struct wsp_ggml_tensor * as,
1124
+ struct wsp_ggml_tensor * b,
1125
+ struct wsp_ggml_tensor * ids);
1126
+
981
1127
  // A: m columns, n rows,
982
1128
  // B: p columns, n rows,
983
1129
  // result is m columns, p rows
@@ -993,13 +1139,13 @@ extern "C" {
993
1139
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale(
994
1140
  struct wsp_ggml_context * ctx,
995
1141
  struct wsp_ggml_tensor * a,
996
- struct wsp_ggml_tensor * b);
1142
+ float s);
997
1143
 
998
1144
  // in-place, returns view(a)
999
1145
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_scale_inplace(
1000
1146
  struct wsp_ggml_context * ctx,
1001
1147
  struct wsp_ggml_tensor * a,
1002
- struct wsp_ggml_tensor * b);
1148
+ float s);
1003
1149
 
1004
1150
  // b -> view(a,offset,nb1,nb2,3), return modified a
1005
1151
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set(
@@ -1009,7 +1155,7 @@ extern "C" {
1009
1155
  size_t nb1,
1010
1156
  size_t nb2,
1011
1157
  size_t nb3,
1012
- size_t offset);
1158
+ size_t offset); // in bytes
1013
1159
 
1014
1160
  // b -> view(a,offset,nb1,nb2,3), return view(a)
1015
1161
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_inplace(
@@ -1019,19 +1165,19 @@ extern "C" {
1019
1165
  size_t nb1,
1020
1166
  size_t nb2,
1021
1167
  size_t nb3,
1022
- size_t offset);
1168
+ size_t offset); // in bytes
1023
1169
 
1024
1170
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d(
1025
1171
  struct wsp_ggml_context * ctx,
1026
1172
  struct wsp_ggml_tensor * a,
1027
1173
  struct wsp_ggml_tensor * b,
1028
- size_t offset);
1174
+ size_t offset); // in bytes
1029
1175
 
1030
1176
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_1d_inplace(
1031
1177
  struct wsp_ggml_context * ctx,
1032
1178
  struct wsp_ggml_tensor * a,
1033
1179
  struct wsp_ggml_tensor * b,
1034
- size_t offset);
1180
+ size_t offset); // in bytes
1035
1181
 
1036
1182
  // b -> view(a,offset,nb1,nb2,3), return modified a
1037
1183
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d(
@@ -1039,7 +1185,7 @@ extern "C" {
1039
1185
  struct wsp_ggml_tensor * a,
1040
1186
  struct wsp_ggml_tensor * b,
1041
1187
  size_t nb1,
1042
- size_t offset);
1188
+ size_t offset); // in bytes
1043
1189
 
1044
1190
  // b -> view(a,offset,nb1,nb2,3), return view(a)
1045
1191
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_2d_inplace(
@@ -1047,8 +1193,7 @@ extern "C" {
1047
1193
  struct wsp_ggml_tensor * a,
1048
1194
  struct wsp_ggml_tensor * b,
1049
1195
  size_t nb1,
1050
- size_t offset);
1051
-
1196
+ size_t offset); // in bytes
1052
1197
 
1053
1198
  // a -> b, return view(b)
1054
1199
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy(
@@ -1056,21 +1201,42 @@ extern "C" {
1056
1201
  struct wsp_ggml_tensor * a,
1057
1202
  struct wsp_ggml_tensor * b);
1058
1203
 
1059
- // a -> b, in-place, return view(b)
1060
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy_inplace(
1204
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cast(
1061
1205
  struct wsp_ggml_context * ctx,
1062
1206
  struct wsp_ggml_tensor * a,
1063
- struct wsp_ggml_tensor * b);
1207
+ enum wsp_ggml_type type);
1064
1208
 
1065
1209
  // make contiguous
1066
1210
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont(
1067
1211
  struct wsp_ggml_context * ctx,
1068
1212
  struct wsp_ggml_tensor * a);
1069
1213
 
1070
- // make contiguous, in-place
1071
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_inplace(
1214
+ // make contiguous, with new shape
1215
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_1d(
1072
1216
  struct wsp_ggml_context * ctx,
1073
- struct wsp_ggml_tensor * a);
1217
+ struct wsp_ggml_tensor * a,
1218
+ int64_t ne0);
1219
+
1220
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_2d(
1221
+ struct wsp_ggml_context * ctx,
1222
+ struct wsp_ggml_tensor * a,
1223
+ int64_t ne0,
1224
+ int64_t ne1);
1225
+
1226
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_3d(
1227
+ struct wsp_ggml_context * ctx,
1228
+ struct wsp_ggml_tensor * a,
1229
+ int64_t ne0,
1230
+ int64_t ne1,
1231
+ int64_t ne2);
1232
+
1233
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_4d(
1234
+ struct wsp_ggml_context * ctx,
1235
+ struct wsp_ggml_tensor * a,
1236
+ int64_t ne0,
1237
+ int64_t ne1,
1238
+ int64_t ne2,
1239
+ int64_t ne3);
1074
1240
 
1075
1241
  // return view(a), b specifies the new shape
1076
1242
  // TODO: when we start computing gradient, make a copy instead of view
@@ -1159,16 +1325,17 @@ extern "C" {
1159
1325
  struct wsp_ggml_context * ctx,
1160
1326
  struct wsp_ggml_tensor * a);
1161
1327
 
1328
+ // supports 3D: a->ne[2] == b->ne[1]
1162
1329
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows(
1163
1330
  struct wsp_ggml_context * ctx,
1164
- struct wsp_ggml_tensor * a,
1165
- struct wsp_ggml_tensor * b);
1331
+ struct wsp_ggml_tensor * a, // data
1332
+ struct wsp_ggml_tensor * b); // row indices
1166
1333
 
1167
1334
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_rows_back(
1168
1335
  struct wsp_ggml_context * ctx,
1169
- struct wsp_ggml_tensor * a,
1170
- struct wsp_ggml_tensor * b,
1171
- struct wsp_ggml_tensor * c);
1336
+ struct wsp_ggml_tensor * a, // gradients of wsp_ggml_get_rows result
1337
+ struct wsp_ggml_tensor * b, // row indices
1338
+ struct wsp_ggml_tensor * c); // data for wsp_ggml_get_rows, only used for its shape
1172
1339
 
1173
1340
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_diag(
1174
1341
  struct wsp_ggml_context * ctx,
@@ -1207,105 +1374,208 @@ extern "C" {
1207
1374
  struct wsp_ggml_context * ctx,
1208
1375
  struct wsp_ggml_tensor * a);
1209
1376
 
1210
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back(
1377
+ // fused soft_max(a*scale + mask*(ALiBi slope))
1378
+ // mask is optional
1379
+ // max_bias = 0.0f for no ALiBi
1380
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
1211
1381
  struct wsp_ggml_context * ctx,
1212
1382
  struct wsp_ggml_tensor * a,
1213
- struct wsp_ggml_tensor * b);
1383
+ struct wsp_ggml_tensor * mask,
1384
+ float scale,
1385
+ float max_bias);
1386
+
1387
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back(
1388
+ struct wsp_ggml_context * ctx,
1389
+ struct wsp_ggml_tensor * a,
1390
+ struct wsp_ggml_tensor * b,
1391
+ float scale,
1392
+ float max_bias);
1214
1393
 
1215
1394
  // in-place, returns view(a)
1216
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back_inplace(
1395
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext_back_inplace(
1217
1396
  struct wsp_ggml_context * ctx,
1218
1397
  struct wsp_ggml_tensor * a,
1219
- struct wsp_ggml_tensor * b);
1398
+ struct wsp_ggml_tensor * b,
1399
+ float scale,
1400
+ float max_bias);
1220
1401
 
1221
1402
  // rotary position embedding
1222
- // if mode & 1 == 1, skip n_past elements
1223
- // if mode & 2 == 1, GPT-NeoX style
1224
- // if mode & 4 == 1, ChatGLM style
1225
- // TODO: avoid creating a new tensor every time
1403
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
1404
+ // if (mode & WSP_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
1405
+ //
1406
+ // b is an int32 vector with size a->ne[2], it contains the positions
1226
1407
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope(
1227
1408
  struct wsp_ggml_context * ctx,
1228
1409
  struct wsp_ggml_tensor * a,
1229
- int n_past,
1410
+ struct wsp_ggml_tensor * b,
1230
1411
  int n_dims,
1231
- int mode,
1232
- int n_ctx);
1412
+ int mode);
1233
1413
 
1234
1414
  // in-place, returns view(a)
1235
1415
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_inplace(
1236
1416
  struct wsp_ggml_context * ctx,
1237
1417
  struct wsp_ggml_tensor * a,
1238
- int n_past,
1418
+ struct wsp_ggml_tensor * b,
1239
1419
  int n_dims,
1240
- int mode,
1241
- int n_ctx);
1420
+ int mode);
1242
1421
 
1243
1422
  // custom RoPE
1244
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
1423
+ // c is freq factors (e.g. phi3-128k), (optional)
1424
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext(
1245
1425
  struct wsp_ggml_context * ctx,
1246
1426
  struct wsp_ggml_tensor * a,
1247
- int n_past,
1427
+ struct wsp_ggml_tensor * b,
1428
+ struct wsp_ggml_tensor * c,
1248
1429
  int n_dims,
1249
1430
  int mode,
1250
- int n_ctx,
1431
+ int n_ctx_orig,
1251
1432
  float freq_base,
1252
- float freq_scale);
1433
+ float freq_scale,
1434
+ float ext_factor,
1435
+ float attn_factor,
1436
+ float beta_fast,
1437
+ float beta_slow);
1438
+
1439
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_multi(
1440
+ struct wsp_ggml_context * ctx,
1441
+ struct wsp_ggml_tensor * a,
1442
+ struct wsp_ggml_tensor * b,
1443
+ struct wsp_ggml_tensor * c,
1444
+ int n_dims,
1445
+ int sections[4],
1446
+ int mode,
1447
+ int n_ctx_orig,
1448
+ float freq_base,
1449
+ float freq_scale,
1450
+ float ext_factor,
1451
+ float attn_factor,
1452
+ float beta_fast,
1453
+ float beta_slow);
1253
1454
 
1254
1455
  // in-place, returns view(a)
1255
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom_inplace(
1456
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext_inplace(
1457
+ struct wsp_ggml_context * ctx,
1458
+ struct wsp_ggml_tensor * a,
1459
+ struct wsp_ggml_tensor * b,
1460
+ struct wsp_ggml_tensor * c,
1461
+ int n_dims,
1462
+ int mode,
1463
+ int n_ctx_orig,
1464
+ float freq_base,
1465
+ float freq_scale,
1466
+ float ext_factor,
1467
+ float attn_factor,
1468
+ float beta_fast,
1469
+ float beta_slow);
1470
+
1471
+ WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
1256
1472
  struct wsp_ggml_context * ctx,
1257
1473
  struct wsp_ggml_tensor * a,
1258
- int n_past,
1474
+ struct wsp_ggml_tensor * b,
1259
1475
  int n_dims,
1260
1476
  int mode,
1261
- int n_ctx,
1477
+ int n_ctx_orig,
1262
1478
  float freq_base,
1263
- float freq_scale);
1479
+ float freq_scale,
1480
+ float ext_factor,
1481
+ float attn_factor,
1482
+ float beta_fast,
1483
+ float beta_slow),
1484
+ "use wsp_ggml_rope_ext instead");
1264
1485
 
1265
- // xPos RoPE, in-place, returns view(a)
1266
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_xpos_inplace(
1486
+ WSP_GGML_DEPRECATED(WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom_inplace(
1267
1487
  struct wsp_ggml_context * ctx,
1268
1488
  struct wsp_ggml_tensor * a,
1269
- int n_past,
1489
+ struct wsp_ggml_tensor * b,
1270
1490
  int n_dims,
1271
- float base,
1272
- bool down);
1491
+ int mode,
1492
+ int n_ctx_orig,
1493
+ float freq_base,
1494
+ float freq_scale,
1495
+ float ext_factor,
1496
+ float attn_factor,
1497
+ float beta_fast,
1498
+ float beta_slow),
1499
+ "use wsp_ggml_rope_ext_inplace instead");
1500
+
1501
+ // compute correction dims for YaRN RoPE scaling
1502
+ WSP_GGML_API void wsp_ggml_rope_yarn_corr_dims(
1503
+ int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
1273
1504
 
1274
1505
  // rotary position embedding backward, i.e compute dx from dy
1275
1506
  // a - dy
1276
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_back(
1507
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_ext_back(
1277
1508
  struct wsp_ggml_context * ctx,
1278
- struct wsp_ggml_tensor * a,
1279
- int n_past,
1509
+ struct wsp_ggml_tensor * a, // gradients of wsp_ggml_rope result
1510
+ struct wsp_ggml_tensor * b, // positions
1511
+ struct wsp_ggml_tensor * c, // freq factors
1280
1512
  int n_dims,
1281
1513
  int mode,
1282
- int n_ctx,
1514
+ int n_ctx_orig,
1283
1515
  float freq_base,
1284
1516
  float freq_scale,
1285
- float xpos_base,
1286
- bool xpos_down);
1517
+ float ext_factor,
1518
+ float attn_factor,
1519
+ float beta_fast,
1520
+ float beta_slow);
1287
1521
 
1288
- // alibi position embedding
1289
- // in-place, returns view(a)
1290
- struct wsp_ggml_tensor * wsp_ggml_alibi(
1522
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_multi_back(
1291
1523
  struct wsp_ggml_context * ctx,
1292
1524
  struct wsp_ggml_tensor * a,
1293
- int n_past,
1294
- int n_head,
1295
- float bias_max);
1525
+ struct wsp_ggml_tensor * b,
1526
+ struct wsp_ggml_tensor * c,
1527
+ int n_dims,
1528
+ int sections[4],
1529
+ int mode,
1530
+ int n_ctx_orig,
1531
+ float freq_base,
1532
+ float freq_scale,
1533
+ float ext_factor,
1534
+ float attn_factor,
1535
+ float beta_fast,
1536
+ float beta_slow);
1537
+
1296
1538
 
1297
1539
  // clamp
1298
1540
  // in-place, returns view(a)
1299
- struct wsp_ggml_tensor * wsp_ggml_clamp(
1541
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_clamp(
1300
1542
  struct wsp_ggml_context * ctx,
1301
1543
  struct wsp_ggml_tensor * a,
1302
1544
  float min,
1303
1545
  float max);
1304
1546
 
1547
+ // im2col
1548
+ // converts data into a format that effectively results in a convolution when combined with matrix multiplication
1549
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col(
1550
+ struct wsp_ggml_context * ctx,
1551
+ struct wsp_ggml_tensor * a, // convolution kernel
1552
+ struct wsp_ggml_tensor * b, // data
1553
+ int s0, // stride dimension 0
1554
+ int s1, // stride dimension 1
1555
+ int p0, // padding dimension 0
1556
+ int p1, // padding dimension 1
1557
+ int d0, // dilation dimension 0
1558
+ int d1, // dilation dimension 1
1559
+ bool is_2D,
1560
+ enum wsp_ggml_type dst_type);
1561
+
1562
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col_back(
1563
+ struct wsp_ggml_context * ctx,
1564
+ struct wsp_ggml_tensor * a, // convolution kernel
1565
+ struct wsp_ggml_tensor * b, // gradient of im2col output
1566
+ int64_t * ne, // shape of im2col input
1567
+ int s0, // stride dimension 0
1568
+ int s1, // stride dimension 1
1569
+ int p0, // padding dimension 0
1570
+ int p1, // padding dimension 1
1571
+ int d0, // dilation dimension 0
1572
+ int d1, // dilation dimension 1
1573
+ bool is_2D);
1574
+
1305
1575
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
1306
1576
  struct wsp_ggml_context * ctx,
1307
- struct wsp_ggml_tensor * a,
1308
- struct wsp_ggml_tensor * b,
1577
+ struct wsp_ggml_tensor * a, // convolution kernel
1578
+ struct wsp_ggml_tensor * b, // data
1309
1579
  int s0, // stride
1310
1580
  int p0, // padding
1311
1581
  int d0); // dilation
@@ -1314,22 +1584,46 @@ extern "C" {
1314
1584
  // alias for wsp_ggml_conv_1d(a, b, s, a->ne[0]/2, d)
1315
1585
  WSP_GGML_API struct wsp_ggml_tensor* wsp_ggml_conv_1d_ph(
1316
1586
  struct wsp_ggml_context * ctx,
1317
- struct wsp_ggml_tensor * a,
1318
- struct wsp_ggml_tensor * b,
1319
- int s,
1320
- int d);
1587
+ struct wsp_ggml_tensor * a, // convolution kernel
1588
+ struct wsp_ggml_tensor * b, // data
1589
+ int s, // stride
1590
+ int d); // dilation
1321
1591
 
1322
- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d(
1592
+ // depthwise
1593
+ // TODO: this is very likely wrong for some cases! - needs more testing
1594
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw(
1323
1595
  struct wsp_ggml_context * ctx,
1324
- struct wsp_ggml_tensor * a,
1325
- struct wsp_ggml_tensor * b,
1326
- int s0,
1327
- int s1,
1328
- int p0,
1329
- int p1,
1330
- int d0,
1331
- int d1);
1596
+ struct wsp_ggml_tensor * a, // convolution kernel
1597
+ struct wsp_ggml_tensor * b, // data
1598
+ int s0, // stride
1599
+ int p0, // padding
1600
+ int d0); // dilation
1601
+
1602
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d_dw_ph(
1603
+ struct wsp_ggml_context * ctx,
1604
+ struct wsp_ggml_tensor * a, // convolution kernel
1605
+ struct wsp_ggml_tensor * b, // data
1606
+ int s0, // stride
1607
+ int d0); // dilation
1608
+
1609
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
1610
+ struct wsp_ggml_context * ctx,
1611
+ struct wsp_ggml_tensor * a, // convolution kernel
1612
+ struct wsp_ggml_tensor * b, // data
1613
+ int s0, // stride
1614
+ int p0, // padding
1615
+ int d0); // dilation
1332
1616
 
1617
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d(
1618
+ struct wsp_ggml_context * ctx,
1619
+ struct wsp_ggml_tensor * a, // convolution kernel
1620
+ struct wsp_ggml_tensor * b, // data
1621
+ int s0, // stride dimension 0
1622
+ int s1, // stride dimension 1
1623
+ int p0, // padding dimension 0
1624
+ int p1, // padding dimension 1
1625
+ int d0, // dilation dimension 0
1626
+ int d1); // dilation dimension 1

  // kernel size is a->ne[0] x a->ne[1]
  // stride is equal to kernel size
@@ -1357,6 +1651,18 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ // depthwise
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d_dw(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a, // convolution kernel
+ struct wsp_ggml_tensor * b, // data
+ int s0, // stride dimension 0
+ int s1, // stride dimension 1
+ int p0, // padding dimension 0
+ int p1, // padding dimension 1
+ int d0, // dilation dimension 0
+ int d1); // dilation dimension 1
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_2d_p0(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1377,6 +1683,8 @@ extern "C" {
  int s0, // stride
  int p0); // padding

+ // the result will have 2*p0 padding for the first dimension
+ // and 2*p1 padding for the second dimension
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pool_2d(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1385,23 +1693,113 @@ extern "C" {
  int k1,
  int s0,
  int s1,
- int p0,
- int p1);
+ float p0,
+ float p1);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pool_2d_back(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * af, // "a"/input used in forward pass
+ enum wsp_ggml_op_pool op,
+ int k0,
+ int k1,
+ int s0,
+ int s1,
+ float p0,
+ float p1);

  // nearest interpolate
+ // multiplies ne0 and ne1 by scale factor
  // used in stable-diffusion
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  int scale_factor);

- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn(
+ // nearest interpolate
+ // nearest interpolate to specified dimensions
+ // used in tortoise.cpp
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_upscale_ext(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int ne0,
+ int ne1,
+ int ne2,
+ int ne3);
+
+ // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int p0,
+ int p1,
+ int p2,
+ int p3);
+
+ // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pad_reflect_1d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int p0,
+ int p1);
+
+ // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
+ // timesteps: [N,]
+ // return: [N, dim]
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_timestep_embedding(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * timesteps,
+ int dim,
+ int max_period);
+
+ // sort rows
+ enum wsp_ggml_sort_order {
+ WSP_GGML_SORT_ORDER_ASC,
+ WSP_GGML_SORT_ORDER_DESC,
+ };
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_argsort(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ enum wsp_ggml_sort_order order);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_arange(
+ struct wsp_ggml_context * ctx,
+ float start,
+ float stop,
+ float step);
+
+ // top k elements per row
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_top_k(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int k);
+
+ #define WSP_GGML_KQ_MASK_PAD 64
1779
+
1780
+ // q: [n_embd, n_batch, n_head, 1]
1781
+ // k: [n_embd, n_kv, n_head_kv, 1]
1782
+ // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
1783
+ // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = WSP_GGML_PAD(n_batch, WSP_GGML_KQ_MASK_PAD) !!
1784
+ // res: [n_embd, n_head, n_batch, 1] !! permuted !!
1785
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_ext(
1399
1786
  struct wsp_ggml_context * ctx,
1400
1787
  struct wsp_ggml_tensor * q,
1401
1788
  struct wsp_ggml_tensor * k,
1402
1789
  struct wsp_ggml_tensor * v,
1403
- bool masked);
1790
+ struct wsp_ggml_tensor * mask,
1791
+ float scale,
1792
+ float max_bias,
1793
+ float logit_softcap);
1794
+
1795
+ WSP_GGML_API void wsp_ggml_flash_attn_ext_set_prec(
1796
+ struct wsp_ggml_tensor * a,
1797
+ enum wsp_ggml_prec prec);
1798
+
1799
+ WSP_GGML_API enum wsp_ggml_prec wsp_ggml_flash_attn_ext_get_prec(
1800
+ const struct wsp_ggml_tensor * a);
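The shape comments above are the contract for the extended call. A hypothetical sketch (not part of the diff) with invented sizes, per-head n_embd = 64, n_head = n_head_kv = 8, n_batch = 32, n_kv = 256, padding the mask's second dimension with WSP_GGML_PAD as the comment requires; tensor types are only illustrative.

// hypothetical usage sketch -- not part of the diffed header
struct wsp_ggml_tensor * q    = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F32, 64, 32, 8, 1);
struct wsp_ggml_tensor * k    = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F16, 64, 256, 8, 1);
struct wsp_ggml_tensor * v    = wsp_ggml_new_tensor_4d(ctx, WSP_GGML_TYPE_F16, 64, 256, 8, 1);
struct wsp_ggml_tensor * mask = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F16,
        256, WSP_GGML_PAD(32, WSP_GGML_KQ_MASK_PAD));                    // n_kv x n_batch_pad
struct wsp_ggml_tensor * res  = wsp_ggml_flash_attn_ext(ctx, q, k, v, mask,
        /*scale=*/1.0f/8.0f, /*max_bias=*/0.0f, /*logit_softcap=*/0.0f); // scale = 1/sqrt(64)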

+ // TODO: needs to be adapted to wsp_ggml_flash_attn_ext
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn_back(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * q,
@@ -1410,13 +1808,19 @@ extern "C" {
  struct wsp_ggml_tensor * d,
  bool masked);

- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_ff(
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ssm_conv(
  struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b0,
- struct wsp_ggml_tensor * b1,
- struct wsp_ggml_tensor * c0,
- struct wsp_ggml_tensor * c1);
+ struct wsp_ggml_tensor * sx,
+ struct wsp_ggml_tensor * c);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_ssm_scan(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * s,
+ struct wsp_ggml_tensor * x,
+ struct wsp_ggml_tensor * dt,
+ struct wsp_ggml_tensor * A,
+ struct wsp_ggml_tensor * B,
+ struct wsp_ggml_tensor * C);

  // partition into non-overlapping windows with padding if needed
  // example:
@@ -1456,7 +1860,6 @@ extern "C" {
  int kh);

  // used in sam
-
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_rel_pos(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1469,6 +1872,24 @@ extern "C" {
  struct wsp_ggml_tensor * pw,
  struct wsp_ggml_tensor * ph);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rwkv_wkv6(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * k,
+ struct wsp_ggml_tensor * v,
+ struct wsp_ggml_tensor * r,
+ struct wsp_ggml_tensor * tf,
+ struct wsp_ggml_tensor * td,
+ struct wsp_ggml_tensor * state);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_gated_linear_attn(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * k,
+ struct wsp_ggml_tensor * v,
+ struct wsp_ggml_tensor * q,
+ struct wsp_ggml_tensor * g,
+ struct wsp_ggml_tensor * state,
+ float scale);
+
  // custom operators

  typedef void (*wsp_ggml_unary_op_f32_t) (const int, float *, const float *);
@@ -1552,7 +1973,8 @@ extern "C" {
  typedef void (*wsp_ggml_custom2_op_t)(struct wsp_ggml_tensor * dst , const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b, int ith, int nth, void * userdata);
  typedef void (*wsp_ggml_custom3_op_t)(struct wsp_ggml_tensor * dst , const struct wsp_ggml_tensor * a, const struct wsp_ggml_tensor * b, const struct wsp_ggml_tensor * c, int ith, int nth, void * userdata);

- #define WSP_GGML_N_TASKS_MAX -1
+ #define WSP_GGML_N_TASKS_MAX (-1)
+ // n_tasks == WSP_GGML_N_TASKS_MAX means to use max number of tasks

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_map_custom1(
  struct wsp_ggml_context * ctx,
@@ -1605,50 +2027,62 @@ extern "C" {
  // loss function

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b);
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a, // logits
+ struct wsp_ggml_tensor * b); // labels

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cross_entropy_loss_back(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * a,
- struct wsp_ggml_tensor * b,
- struct wsp_ggml_tensor * c);
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a, // logits
+ struct wsp_ggml_tensor * b, // labels
+ struct wsp_ggml_tensor * c); // gradients of cross_entropy_loss result
+
+ // AdamW optimizer step
+ // Paper: https://arxiv.org/pdf/1711.05101v3.pdf
+ // PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_opt_step_adamw(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * grad,
+ struct wsp_ggml_tensor * m,
+ struct wsp_ggml_tensor * v,
+ struct wsp_ggml_tensor * adamw_params); // parameters such a the learning rate

  //
  // automatic differentiation
  //

- WSP_GGML_API void wsp_ggml_set_param(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_ggml_build_forward_expand(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_ggml_build_backward_expand(
+ struct wsp_ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
+ struct wsp_ggml_context * ctx_compute, // context for gradient computation
+ struct wsp_ggml_cgraph * cgraph,
+ bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static

+ // graph allocation in a context
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx); // size = WSP_GGML_DEFAULT_GRAPH_SIZE, grads = false
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph_custom(struct wsp_ggml_context * ctx, size_t size, bool grads);
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_dup (struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph);
+ WSP_GGML_API void wsp_ggml_graph_cpy (struct wsp_ggml_cgraph * src, struct wsp_ggml_cgraph * dst);
+ WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
+ WSP_GGML_API void wsp_ggml_graph_clear (struct wsp_ggml_cgraph * cgraph);

- WSP_GGML_API void wsp_ggml_build_forward_expand (struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
- WSP_GGML_API void wsp_ggml_build_backward_expand(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, struct wsp_ggml_cgraph * gb, bool keep);
+ WSP_GGML_API int wsp_ggml_graph_size (struct wsp_ggml_cgraph * cgraph);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_node (struct wsp_ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+ WSP_GGML_API struct wsp_ggml_tensor ** wsp_ggml_graph_nodes (struct wsp_ggml_cgraph * cgraph);
+ WSP_GGML_API int wsp_ggml_graph_n_nodes(struct wsp_ggml_cgraph * cgraph);

- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_forward (struct wsp_ggml_tensor * tensor);
- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_backward(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, bool keep);
+ WSP_GGML_API void wsp_ggml_graph_add_node(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);

- // graph allocation in a context
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx);
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_build_forward_ctx(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
  WSP_GGML_API size_t wsp_ggml_graph_overhead(void);
+ WSP_GGML_API size_t wsp_ggml_graph_overhead_custom(size_t size, bool grads);

- // wsp_ggml_graph_plan() has to be called before wsp_ggml_graph_compute()
- // when plan.work_size > 0, caller must allocate memory for plan.work_data
- WSP_GGML_API struct wsp_ggml_cplan wsp_ggml_graph_plan (struct wsp_ggml_cgraph * cgraph, int n_threads /*= WSP_GGML_DEFAULT_N_THREADS*/);
- WSP_GGML_API int wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);
- WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph);
-
- // same as wsp_ggml_graph_compute() but the work data is allocated as a part of the context
- // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
- WSP_GGML_API void wsp_ggml_graph_compute_with_ctx(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph, int n_threads);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor (const struct wsp_ggml_cgraph * cgraph, const char * name);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_grad (const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_grad_acc(const struct wsp_ggml_cgraph * cgraph, const struct wsp_ggml_tensor * node);

- WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgraph, const char * name);
-
- WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname);
- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval);
+ WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname);
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval);
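A minimal hypothetical sketch (not part of the diff) of the reworked graph API: allocate a graph in a context, expand it from a result tensor, then walk its nodes. Executing the graph now goes through the backend API (see ggml-backend.h in this package) rather than the removed wsp_ggml_graph_compute_with_ctx; 'cur' is the tensor built in the earlier conv sketch.

// hypothetical usage sketch -- not part of the diffed header
struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx);           // default size, no grads
wsp_ggml_build_forward_expand(gf, cur);                          // record everything needed to compute 'cur'
for (int i = 0; i < wsp_ggml_graph_n_nodes(gf); ++i) {
    struct wsp_ggml_tensor * node = wsp_ggml_graph_node(gf, i);  // i < 0 would index from the end
    (void) node;                                                 // e.g. inspect node->op / node->name here
}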

  // print info and performance information for the graph
  WSP_GGML_API void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph);
@@ -1656,359 +2090,103 @@ extern "C" {
  // dump the graph into a file using the dot format
  WSP_GGML_API void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp_ggml_cgraph * gf, const char * filename);

- //
- // optimization
- //
-
- // optimization methods
- enum wsp_ggml_opt_type {
- WSP_GGML_OPT_ADAM,
- WSP_GGML_OPT_LBFGS,
- };
-
- // linesearch methods
- enum wsp_ggml_linesearch {
- WSP_GGML_LINESEARCH_DEFAULT = 1,
-
- WSP_GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
- WSP_GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
- WSP_GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
- };
-
- // optimization return values
- enum wsp_ggml_opt_result {
- WSP_GGML_OPT_OK = 0,
- WSP_GGML_OPT_DID_NOT_CONVERGE,
- WSP_GGML_OPT_NO_CONTEXT,
- WSP_GGML_OPT_INVALID_WOLFE,
- WSP_GGML_OPT_FAIL,
-
- WSP_GGML_LINESEARCH_FAIL = -128,
- WSP_GGML_LINESEARCH_MINIMUM_STEP,
- WSP_GGML_LINESEARCH_MAXIMUM_STEP,
- WSP_GGML_LINESEARCH_MAXIMUM_ITERATIONS,
- WSP_GGML_LINESEARCH_INVALID_PARAMETERS,
- };
-
- typedef void (*wsp_ggml_opt_callback)(void * data, float * sched);
-
- // optimization parameters
- //
- // see ggml.c (wsp_ggml_opt_default_params) for default values
- //
- struct wsp_ggml_opt_params {
- enum wsp_ggml_opt_type type;
-
- int n_threads;
-
- // delta-based convergence test
- //
- // if past == 0 - disabled
- // if past > 0:
- // stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
- //
- int past;
- float delta;
-
- // maximum number of iterations without improvement
- //
- // if 0 - disabled
- // if > 0:
- // assume convergence if no cost improvement in this number of iterations
- //
- int max_no_improvement;
-
- bool print_forward_graph;
- bool print_backward_graph;
-
- // ADAM parameters
- struct {
- int n_iter;
-
- float sched; // schedule multiplier (fixed, decay or warmup)
- float decay; // weight decay for AdamW, use 0.0f to disable
- int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
- float alpha; // learning rate
- float beta1;
- float beta2;
- float eps; // epsilon for numerical stability
- float eps_f; // epsilon for convergence test
- float eps_g; // epsilon for convergence test
- float gclip; // gradient clipping
- } adam;
-
- // LBFGS parameters
- struct {
- int m; // number of corrections to approximate the inv. Hessian
- int n_iter;
- int max_linesearch;
-
- float eps; // convergence tolerance
- float ftol; // line search tolerance
- float wolfe;
- float min_step;
- float max_step;
-
- enum wsp_ggml_linesearch linesearch;
- } lbfgs;
- };
-
- struct wsp_ggml_opt_context {
- struct wsp_ggml_context * ctx;
- struct wsp_ggml_opt_params params;
-
- int iter;
- int64_t nx; // number of parameter elements
-
- bool just_initialized;
-
- float loss_before;
- float loss_after;
-
- struct {
- struct wsp_ggml_tensor * m; // first moment
- struct wsp_ggml_tensor * v; // second moment
- struct wsp_ggml_tensor * pf; // past function values
- float fx_best;
- float fx_prev;
- int n_no_improvement;
- } adam;
-
- struct {
- struct wsp_ggml_tensor * x; // current parameters
- struct wsp_ggml_tensor * xp; // previous parameters
- struct wsp_ggml_tensor * g; // current gradient
- struct wsp_ggml_tensor * gp; // previous gradient
- struct wsp_ggml_tensor * d; // search direction
- struct wsp_ggml_tensor * pf; // past function values
- struct wsp_ggml_tensor * lmal; // the L-BFGS memory alpha
- struct wsp_ggml_tensor * lmys; // the L-BFGS memory ys
- struct wsp_ggml_tensor * lms; // the L-BFGS memory s
- struct wsp_ggml_tensor * lmy; // the L-BFGS memory y
- float fx_best;
- float step;
- int j;
- int k;
- int end;
- int n_no_improvement;
- } lbfgs;
- };
-
- WSP_GGML_API struct wsp_ggml_opt_params wsp_ggml_opt_default_params(enum wsp_ggml_opt_type type);
-
- // optimize the function defined by the tensor f
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_opt_params params,
- struct wsp_ggml_tensor * f);
-
- // initialize optimizer context
- WSP_GGML_API void wsp_ggml_opt_init(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_opt_context * opt,
- struct wsp_ggml_opt_params params,
- int64_t nx);
+ // TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
+ typedef void (*wsp_ggml_log_callback)(enum wsp_ggml_log_level level, const char * text, void * user_data);

- // continue optimizing the function defined by the tensor f
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_opt_context * opt,
- struct wsp_ggml_tensor * f);
+ // Set callback for all future logging events.
+ // If this is not called, or NULL is supplied, everything is output on stderr.
+ WSP_GGML_API void wsp_ggml_log_set(wsp_ggml_log_callback log_callback, void * user_data);
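A hypothetical sketch (not part of the diff) of routing ggml log output through a custom callback instead of stderr, using the typedef and setter declared above; it only assumes the standard <stdio.h>.

// hypothetical usage sketch -- not part of the diffed header
#include <stdio.h>

static void my_log_cb(enum wsp_ggml_log_level level, const char * text, void * user_data) {
    (void) level; (void) user_data;
    fprintf(stderr, "[ggml] %s", text);   // forward the message to stderr (or a file/logger of choice)
}

// somewhere during initialization:
wsp_ggml_log_set(my_log_cb, /*user_data=*/NULL);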

- // continue optimizing the function defined by the tensor f
- WSP_GGML_API enum wsp_ggml_opt_result wsp_ggml_opt_resume_g(
- struct wsp_ggml_context * ctx,
- struct wsp_ggml_opt_context * opt,
- struct wsp_ggml_tensor * f,
- struct wsp_ggml_cgraph * gf,
- struct wsp_ggml_cgraph * gb,
- wsp_ggml_opt_callback callback,
- void * callback_data);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);

  //
  // quantization
  //

- WSP_GGML_API size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
- WSP_GGML_API size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
-
+ // - wsp_ggml_wsp_quantize_init can be called multiple times with the same type
+ // it will only initialize the quantization tables for the first call or after wsp_ggml_wsp_quantize_free
+ // automatically called by wsp_ggml_wsp_quantize_chunk for convenience
  //
- // gguf
+ // - wsp_ggml_wsp_quantize_free will free any memory allocated by wsp_ggml_wsp_quantize_init
+ // call this at the end of the program to avoid memory leaks
  //
-
- enum gguf_type {
- GGUF_TYPE_UINT8 = 0,
- GGUF_TYPE_INT8 = 1,
- GGUF_TYPE_UINT16 = 2,
- GGUF_TYPE_INT16 = 3,
- GGUF_TYPE_UINT32 = 4,
- GGUF_TYPE_INT32 = 5,
- GGUF_TYPE_FLOAT32 = 6,
- GGUF_TYPE_BOOL = 7,
- GGUF_TYPE_STRING = 8,
- GGUF_TYPE_ARRAY = 9,
- GGUF_TYPE_UINT64 = 10,
- GGUF_TYPE_INT64 = 11,
- GGUF_TYPE_FLOAT64 = 12,
- GGUF_TYPE_COUNT, // marks the end of the enum
+ // note: these are thread-safe
+ //
+ WSP_GGML_API void wsp_ggml_wsp_quantize_init(enum wsp_ggml_type type);
+ WSP_GGML_API void wsp_ggml_wsp_quantize_free(void);
+
+ // some quantization type cannot be used without an importance matrix
+ WSP_GGML_API bool wsp_ggml_wsp_quantize_requires_imatrix(enum wsp_ggml_type type);
+
+ // calls wsp_ggml_wsp_quantize_init internally (i.e. can allocate memory)
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(
+ enum wsp_ggml_type type,
+ const float * src,
+ void * dst,
+ int64_t start,
+ int64_t nrows,
+ int64_t n_per_row,
+ const float * imatrix);
+
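A hypothetical sketch (not part of the diff) of quantizing a row-major F32 weight matrix with the chunk API declared above. Sizes are invented; wsp_ggml_row_size (declared elsewhere in ggml.h) is assumed here to size the destination buffer, and Q4_0 is used because it needs no importance matrix.

// hypothetical usage sketch -- not part of the diffed header
#include <stdlib.h>

const int64_t n_per_row = 4096, nrows = 128;
float * src = calloc((size_t)(n_per_row * nrows), sizeof(float));                    // weights to quantize (zeros here)
void  * dst = malloc(wsp_ggml_row_size(WSP_GGML_TYPE_Q4_0, n_per_row) * (size_t)nrows); // room for the quantized rows

size_t written = wsp_ggml_wsp_quantize_chunk(WSP_GGML_TYPE_Q4_0, src, dst,
        /*start=*/0, nrows, n_per_row, /*imatrix=*/NULL);
(void) written;                 // number of bytes produced
wsp_ggml_wsp_quantize_free();   // release any tables allocated by the implicit init
free(src); free(dst);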
+ #ifdef __cplusplus
+ // restrict not standard in C++
+ # if defined(__GNUC__)
+ # define WSP_GGML_RESTRICT __restrict__
+ # elif defined(__clang__)
+ # define WSP_GGML_RESTRICT __restrict
+ # elif defined(_MSC_VER)
+ # define WSP_GGML_RESTRICT __restrict
+ # else
+ # define WSP_GGML_RESTRICT
+ # endif
+ #else
+ # define WSP_GGML_RESTRICT restrict
+ #endif
+ typedef void (*wsp_ggml_to_float_t) (const void * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int64_t k);
+ typedef void (*wsp_ggml_from_float_t)(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int64_t k);
+
+ struct wsp_ggml_type_traits {
+ const char * type_name;
+ int64_t blck_size;
+ int64_t blck_size_interleave; // interleave elements in blocks
+ size_t type_size;
+ bool is_quantized;
+ wsp_ggml_to_float_t to_float;
+ wsp_ggml_from_float_t from_float_ref;
  };

- struct gguf_context;
+ WSP_GGML_API const struct wsp_ggml_type_traits * wsp_ggml_get_type_traits(enum wsp_ggml_type type);
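The traits struct is now returned by pointer from wsp_ggml_get_type_traits rather than by value from the old wsp_ggml_internal_get_type_traits. A hypothetical sketch (not part of the diff; assumes <stdio.h>) of reading the fields declared above:

// hypothetical usage sketch -- not part of the diffed header
#include <stdio.h>

const struct wsp_ggml_type_traits * tt = wsp_ggml_get_type_traits(WSP_GGML_TYPE_Q4_0);
printf("%s: block size %lld, %zu bytes per block, quantized=%d\n",
       tt->type_name, (long long) tt->blck_size, tt->type_size, (int) tt->is_quantized);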

- struct gguf_init_params {
- bool no_alloc;
+ // ggml threadpool
+ // TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
+ // the goal should be to create an API that other backends can use move everything to the ggml base

- // if not NULL, create a wsp_ggml_context and allocate the tensor data in it
- struct wsp_ggml_context ** ctx;
+ // scheduling priorities
+ enum wsp_ggml_sched_priority {
+ WSP_GGML_SCHED_PRIO_NORMAL,
+ WSP_GGML_SCHED_PRIO_MEDIUM,
+ WSP_GGML_SCHED_PRIO_HIGH,
+ WSP_GGML_SCHED_PRIO_REALTIME
  };

- WSP_GGML_API struct gguf_context * gguf_init_empty(void);
- WSP_GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
- //WSP_GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
- WSP_GGML_API void gguf_free(struct gguf_context * ctx);
-
- WSP_GGML_API const char * gguf_type_name(enum gguf_type type);
-
- WSP_GGML_API int gguf_get_version (const struct gguf_context * ctx);
- WSP_GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
- WSP_GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
- WSP_GGML_API void * gguf_get_data (const struct gguf_context * ctx);
-
- WSP_GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
- WSP_GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
- WSP_GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
-
- WSP_GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
- WSP_GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
-
- // results are undefined if the wrong type is used for the key
- WSP_GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
- WSP_GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
- WSP_GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
- WSP_GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
- WSP_GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
- WSP_GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
- WSP_GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
- WSP_GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
- WSP_GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
-
- // overrides existing values or adds a new one
- WSP_GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
- WSP_GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
- WSP_GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
- WSP_GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
- WSP_GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
- WSP_GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
- WSP_GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
- WSP_GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
- WSP_GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
- WSP_GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
- WSP_GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
- WSP_GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
- WSP_GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
- WSP_GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
- // set or add KV pairs from another context
- WSP_GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
- // manage tensor info
- WSP_GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum wsp_ggml_type type);
- WSP_GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
- // writing gguf files can be done in 2 ways:
- //
- // - write the entire gguf_context to a binary file in a single pass:
- //
- // gguf_write_to_file(ctx, fname);
- //
- // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
- //
- // FILE * f = fopen(fname, "wb");
- // fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
- // fwrite(f, ...);
- // void * data = gguf_meta_get_meta_data(ctx);
- // fseek(f, 0, SEEK_SET);
- // fwrite(f, data, gguf_get_meta_size(ctx));
- // free(data);
- // fclose(f);
- //
-
- // write the entire context to a binary file
- WSP_GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
- // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
- WSP_GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
- WSP_GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
- //
- // system info
- //
+ // threadpool params
+ // Use wsp_ggml_threadpool_params_default() or wsp_ggml_threadpool_params_init() to populate the defaults
+ struct wsp_ggml_threadpool_params {
+ bool cpumask[WSP_GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
+ int n_threads; // number of threads
+ enum wsp_ggml_sched_priority prio; // thread priority
+ uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
+ bool strict_cpu; // strict cpu placement
+ bool paused; // start in paused state
+ };

- WSP_GGML_API int wsp_ggml_cpu_has_avx (void);
- WSP_GGML_API int wsp_ggml_cpu_has_avx2 (void);
- WSP_GGML_API int wsp_ggml_cpu_has_avx512 (void);
- WSP_GGML_API int wsp_ggml_cpu_has_avx512_vbmi(void);
- WSP_GGML_API int wsp_ggml_cpu_has_avx512_vnni(void);
- WSP_GGML_API int wsp_ggml_cpu_has_fma (void);
- WSP_GGML_API int wsp_ggml_cpu_has_neon (void);
- WSP_GGML_API int wsp_ggml_cpu_has_arm_fma (void);
- WSP_GGML_API int wsp_ggml_cpu_has_metal (void);
- WSP_GGML_API int wsp_ggml_cpu_has_f16c (void);
- WSP_GGML_API int wsp_ggml_cpu_has_fp16_va (void);
- WSP_GGML_API int wsp_ggml_cpu_has_wasm_simd (void);
- WSP_GGML_API int wsp_ggml_cpu_has_blas (void);
- WSP_GGML_API int wsp_ggml_cpu_has_cublas (void);
- WSP_GGML_API int wsp_ggml_cpu_has_clblast (void);
- WSP_GGML_API int wsp_ggml_cpu_has_gpublas (void);
- WSP_GGML_API int wsp_ggml_cpu_has_sse3 (void);
- WSP_GGML_API int wsp_ggml_cpu_has_ssse3 (void);
- WSP_GGML_API int wsp_ggml_cpu_has_vsx (void);
+ struct wsp_ggml_threadpool; // forward declaration, see ggml.c

- //
- // Internal types and functions exposed for tests and benchmarks
- //
+ typedef struct wsp_ggml_threadpool * wsp_ggml_threadpool_t;

- #ifdef __cplusplus
- // restrict not standard in C++
- #define WSP_GGML_RESTRICT
- #else
- #define WSP_GGML_RESTRICT restrict
- #endif
- typedef void (*wsp_ggml_to_float_t) (const void * WSP_GGML_RESTRICT x, float * WSP_GGML_RESTRICT y, int k);
- typedef void (*wsp_ggml_from_float_t)(const float * WSP_GGML_RESTRICT x, void * WSP_GGML_RESTRICT y, int k);
- typedef void (*wsp_ggml_vec_dot_t) (const int n, float * WSP_GGML_RESTRICT s, const void * WSP_GGML_RESTRICT x, const void * WSP_GGML_RESTRICT y);
-
- typedef struct {
- const char * type_name;
- int blck_size;
- size_t type_size;
- bool is_quantized;
- wsp_ggml_to_float_t to_float;
- wsp_ggml_from_float_t from_float;
- wsp_ggml_from_float_t from_float_reference;
- wsp_ggml_vec_dot_t vec_dot;
- enum wsp_ggml_type vec_dot_type;
- } wsp_ggml_type_traits_t;
-
- wsp_ggml_type_traits_t wsp_ggml_internal_get_type_traits(enum wsp_ggml_type type);
+ WSP_GGML_API struct wsp_ggml_threadpool_params wsp_ggml_threadpool_params_default(int n_threads);
+ WSP_GGML_API void wsp_ggml_threadpool_params_init (struct wsp_ggml_threadpool_params * p, int n_threads);
+ WSP_GGML_API bool wsp_ggml_threadpool_params_match (const struct wsp_ggml_threadpool_params * p0, const struct wsp_ggml_threadpool_params * p1);
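A hypothetical sketch (not part of the diff) of populating the new threadpool params and tweaking a few of the fields declared above; creating and attaching the pool itself lives in the CPU backend API (ggml-cpu.h in this package), per the TODO note.

// hypothetical usage sketch -- not part of the diffed header
struct wsp_ggml_threadpool_params tpp = wsp_ggml_threadpool_params_default(8); // 8 threads, default cpumask
tpp.prio       = WSP_GGML_SCHED_PRIO_HIGH;  // raise thread priority
tpp.poll       = 50;                        // moderate polling (0 = none, 100 = aggressive)
tpp.strict_cpu = true;                      // request strict cpu placement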

  #ifdef __cplusplus
  }