whisper.rn 0.4.0-rc.3 → 0.4.0-rc.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. package/README.md +6 -6
  2. package/android/build.gradle +4 -0
  3. package/android/src/main/CMakeLists.txt +7 -0
  4. package/android/src/main/java/com/rnwhisper/AudioUtils.java +0 -80
  5. package/android/src/main/java/com/rnwhisper/RNWhisper.java +6 -1
  6. package/android/src/main/java/com/rnwhisper/WhisperContext.java +53 -135
  7. package/android/src/main/jni-utils.h +76 -0
  8. package/android/src/main/jni.cpp +188 -109
  9. package/cpp/README.md +1 -1
  10. package/cpp/coreml/whisper-encoder-impl.h +1 -1
  11. package/cpp/coreml/whisper-encoder.h +4 -0
  12. package/cpp/coreml/whisper-encoder.mm +4 -2
  13. package/cpp/ggml-alloc.c +451 -282
  14. package/cpp/ggml-alloc.h +74 -8
  15. package/cpp/ggml-backend-impl.h +112 -0
  16. package/cpp/ggml-backend.c +1357 -0
  17. package/cpp/ggml-backend.h +181 -0
  18. package/cpp/ggml-impl.h +243 -0
  19. package/cpp/{ggml-metal.metal → ggml-metal-whisper.metal} +1556 -329
  20. package/cpp/ggml-metal.h +28 -1
  21. package/cpp/ggml-metal.m +1128 -308
  22. package/cpp/ggml-quants.c +7382 -0
  23. package/cpp/ggml-quants.h +224 -0
  24. package/cpp/ggml.c +3848 -5245
  25. package/cpp/ggml.h +353 -155
  26. package/cpp/rn-audioutils.cpp +68 -0
  27. package/cpp/rn-audioutils.h +14 -0
  28. package/cpp/rn-whisper-log.h +11 -0
  29. package/cpp/rn-whisper.cpp +141 -59
  30. package/cpp/rn-whisper.h +47 -15
  31. package/cpp/whisper.cpp +1750 -964
  32. package/cpp/whisper.h +97 -15
  33. package/ios/RNWhisper.mm +15 -9
  34. package/ios/RNWhisper.xcodeproj/project.xcworkspace/contents.xcworkspacedata +4 -0
  35. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist +8 -0
  36. package/ios/RNWhisper.xcodeproj/project.xcworkspace/xcuserdata/jhen.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  37. package/ios/RNWhisper.xcodeproj/xcuserdata/jhen.xcuserdatad/xcschemes/xcschememanagement.plist +19 -0
  38. package/ios/RNWhisperAudioUtils.h +0 -2
  39. package/ios/RNWhisperAudioUtils.m +0 -56
  40. package/ios/RNWhisperContext.h +8 -12
  41. package/ios/RNWhisperContext.mm +132 -138
  42. package/jest/mock.js +1 -1
  43. package/lib/commonjs/NativeRNWhisper.js.map +1 -1
  44. package/lib/commonjs/index.js +28 -9
  45. package/lib/commonjs/index.js.map +1 -1
  46. package/lib/commonjs/version.json +1 -1
  47. package/lib/module/NativeRNWhisper.js.map +1 -1
  48. package/lib/module/index.js +28 -9
  49. package/lib/module/index.js.map +1 -1
  50. package/lib/module/version.json +1 -1
  51. package/lib/typescript/NativeRNWhisper.d.ts +7 -1
  52. package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
  53. package/lib/typescript/index.d.ts +7 -2
  54. package/lib/typescript/index.d.ts.map +1 -1
  55. package/package.json +6 -5
  56. package/src/NativeRNWhisper.ts +8 -1
  57. package/src/index.ts +29 -17
  58. package/src/version.json +1 -1
  59. package/whisper-rn.podspec +1 -2
package/cpp/ggml.h CHANGED
@@ -58,7 +58,8 @@
  // {
  // ...
  //
- // struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(f);
+ // struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx);
+ // wsp_ggml_build_forward_expand(gf, f);
  //
  // // set the input variable and parameter values
  // wsp_ggml_set_f32(x, 2.0f);
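For reference, a minimal sketch of the graph-construction flow that the updated header comment above describes. Only `wsp_ggml_new_graph`, `wsp_ggml_build_forward_expand`, `wsp_ggml_set_f32`, `wsp_ggml_get_f32_1d`, `wsp_ggml_graph_plan`, `wsp_ggml_graph_compute`, and `WSP_GGML_DEFAULT_N_THREADS` appear in this diff; the remaining identifiers are assumed to follow the usual `wsp_`-prefixed ggml API, so treat this as a sketch rather than the package's own usage.

```c
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

// Hedged sketch: build and run f = a * x with the new graph API.
static void example_graph(void) {
    struct wsp_ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct wsp_ggml_context * ctx = wsp_ggml_init(params);   // assumed wsp_-prefixed ggml_init

    struct wsp_ggml_tensor * x = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1);
    struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 1);
    struct wsp_ggml_tensor * f = wsp_ggml_mul(ctx, a, x);    // f = a * x

    // old API: struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(f);
    // new API: allocate the graph in the context, then expand it from the output tensor
    struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph(ctx);
    wsp_ggml_build_forward_expand(gf, f);

    wsp_ggml_set_f32(x, 2.0f);
    wsp_ggml_set_f32(a, 3.0f);

    struct wsp_ggml_cplan plan = wsp_ggml_graph_plan(gf, WSP_GGML_DEFAULT_N_THREADS);
    void * work = NULL;
    if (plan.work_size > 0) {
        work = malloc(plan.work_size);                        // caller must provide work memory, per the header
        plan.work_data = work;
    }
    wsp_ggml_graph_compute(gf, &plan);

    printf("f = %f\n", wsp_ggml_get_f32_1d(f, 0));            // 6.0

    free(work);
    wsp_ggml_free(ctx);
}
```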
@@ -213,15 +214,14 @@
  #define WSP_GGML_QNT_VERSION 2 // bump this on quantization format changes
  #define WSP_GGML_QNT_VERSION_FACTOR 1000 // do not change this

- #define WSP_GGML_MAX_DIMS 4
- #define WSP_GGML_MAX_NODES 4096
- #define WSP_GGML_MAX_PARAMS 256
- #define WSP_GGML_MAX_CONTEXTS 64
- #define WSP_GGML_MAX_SRC 6
- #define WSP_GGML_MAX_NAME 64
- #define WSP_GGML_MAX_OP_PARAMS 32
- #define WSP_GGML_DEFAULT_N_THREADS 4
-
+ #define WSP_GGML_MAX_DIMS 4
+ #define WSP_GGML_MAX_PARAMS 1024
+ #define WSP_GGML_MAX_CONTEXTS 64
+ #define WSP_GGML_MAX_SRC 6
+ #define WSP_GGML_MAX_NAME 64
+ #define WSP_GGML_MAX_OP_PARAMS 64
+ #define WSP_GGML_DEFAULT_N_THREADS 4
+ #define WSP_GGML_DEFAULT_GRAPH_SIZE 2048
  #if UINTPTR_MAX == 0xFFFFFFFF
  #define WSP_GGML_MEM_ALIGN 4
  #else
@@ -231,10 +231,11 @@
  #define WSP_GGML_EXIT_SUCCESS 0
  #define WSP_GGML_EXIT_ABORTED 1

- #define GGUF_MAGIC 0x46554747 // "GGUF"
- #define GGUF_VERSION 2
+ #define WSP_GGUF_MAGIC "GGUF"
+
+ #define WSP_GGUF_VERSION 3

- #define GGUF_DEFAULT_ALIGNMENT 32
+ #define WSP_GGUF_DEFAULT_ALIGNMENT 32

  #define WSP_GGML_UNUSED(x) (void)(x)

@@ -243,11 +244,21 @@
  #define WSP_GGML_ASSERT(x) \
  do { \
  if (!(x)) { \
+ fflush(stdout); \
  fprintf(stderr, "WSP_GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ wsp_ggml_print_backtrace(); \
  abort(); \
  } \
  } while (0)

+ #ifndef NDEBUG
+ #define WSP_GGML_UNREACHABLE() WSP_GGML_ASSERT(!"statement should not be reached")
+ #elif defined(__GNUC__)
+ #define WSP_GGML_UNREACHABLE() __builtin_unreachable()
+ #else
+ #define WSP_GGML_UNREACHABLE() ((void) 0)
+ #endif
+
  // used to copy the number of elements and stride in bytes of tensors into local variables.
  // main purpose is to reduce code duplication and improve readability.
  //
@@ -272,6 +283,20 @@
  const type prefix##3 = (pointer)->array[3]; \
  WSP_GGML_UNUSED(prefix##3);

+ #define WSP_GGML_TENSOR_UNARY_OP_LOCALS \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
+ #define WSP_GGML_TENSOR_BINARY_OP_LOCALS \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \
+ WSP_GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
+ WSP_GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
+
  #ifdef __cplusplus
  extern "C" {
  #endif
@@ -318,7 +343,7 @@ extern "C" {
  WSP_GGML_TYPE_COUNT,
  };

- enum wsp_ggml_backend {
+ enum wsp_ggml_backend_type {
  WSP_GGML_BACKEND_CPU = 0,
  WSP_GGML_BACKEND_GPU = 10,
  WSP_GGML_BACKEND_GPU_SPLIT = 20,
@@ -370,6 +395,7 @@ extern "C" {
  WSP_GGML_OP_GROUP_NORM,

  WSP_GGML_OP_MUL_MAT,
+ WSP_GGML_OP_MUL_MAT_ID,
  WSP_GGML_OP_OUT_PROD,

  WSP_GGML_OP_SCALE,
@@ -391,13 +417,13 @@ extern "C" {
  WSP_GGML_OP_ROPE_BACK,
  WSP_GGML_OP_ALIBI,
  WSP_GGML_OP_CLAMP,
- WSP_GGML_OP_CONV_1D,
- WSP_GGML_OP_CONV_2D,
+ WSP_GGML_OP_CONV_TRANSPOSE_1D,
+ WSP_GGML_OP_IM2COL,
  WSP_GGML_OP_CONV_TRANSPOSE_2D,
  WSP_GGML_OP_POOL_1D,
  WSP_GGML_OP_POOL_2D,
-
  WSP_GGML_OP_UPSCALE, // nearest interpolate
+ WSP_GGML_OP_ARGSORT,

  WSP_GGML_OP_FLASH_ATTN,
  WSP_GGML_OP_FLASH_FF,
@@ -437,6 +463,9 @@ extern "C" {
  WSP_GGML_UNARY_OP_GELU,
  WSP_GGML_UNARY_OP_GELU_QUICK,
  WSP_GGML_UNARY_OP_SILU,
+ WSP_GGML_UNARY_OP_LEAKY,
+
+ WSP_GGML_UNARY_OP_COUNT,
  };

  enum wsp_ggml_object_type {
@@ -445,6 +474,12 @@ extern "C" {
  WSP_GGML_OBJECT_WORK_BUFFER
  };

+ enum wsp_ggml_log_level {
+ WSP_GGML_LOG_LEVEL_ERROR = 2,
+ WSP_GGML_LOG_LEVEL_WARN = 3,
+ WSP_GGML_LOG_LEVEL_INFO = 4
+ };
+
  // ggml object
  struct wsp_ggml_object {
  size_t offs;
@@ -461,14 +496,16 @@ extern "C" {

  // n-dimensional tensor
  struct wsp_ggml_tensor {
- enum wsp_ggml_type type;
- enum wsp_ggml_backend backend;
+ enum wsp_ggml_type type;
+ enum wsp_ggml_backend_type backend;
+
+ struct wsp_ggml_backend_buffer * buffer;

  int n_dims;
  int64_t ne[WSP_GGML_MAX_DIMS]; // number of elements
  size_t nb[WSP_GGML_MAX_DIMS]; // stride in bytes:
- // nb[0] = sizeof(type)
- // nb[1] = nb[0] * ne[0] + padding
+ // nb[0] = wsp_ggml_type_size(type)
+ // nb[1] = nb[0] * (ne[0] / wsp_ggml_blck_size(type)) + padding
  // nb[i] = nb[i-1] * ne[i-1]

  // compute data
@@ -496,7 +533,7 @@ extern "C" {

  void * extra; // extra things e.g. for ggml-cuda.cu

- char padding[4];
+ char padding[12];
  };

  static const size_t WSP_GGML_TENSOR_SIZE = sizeof(struct wsp_ggml_tensor);
@@ -509,29 +546,35 @@ extern "C" {

  int n_threads;

- // the `n_tasks` of nodes, 1:1 mapping to cgraph nodes
- int n_tasks[WSP_GGML_MAX_NODES];
-
  // abort wsp_ggml_graph_compute when true
  bool (*abort_callback)(void * data);
  void * abort_callback_data;
  };

- // next prime after WSP_GGML_MAX_NODES
- // #define WSP_GGML_GRAPH_HASHTABLE_SIZE 4099
- // next prime after WSP_GGML_MAX_NODES * 2 (nodes + leafs)
- #define WSP_GGML_GRAPH_HASHTABLE_SIZE 8273
+ enum wsp_ggml_cgraph_eval_order {
+ WSP_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+ WSP_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+ WSP_GGML_CGRAPH_EVAL_ORDER_COUNT
+ };
+
+ struct wsp_ggml_hash_set {
+ size_t size;
+ struct wsp_ggml_tensor ** keys;
+ };

  // computation graph
  struct wsp_ggml_cgraph {
+ int size;
  int n_nodes;
  int n_leafs;

- struct wsp_ggml_tensor * nodes[WSP_GGML_MAX_NODES];
- struct wsp_ggml_tensor * grads[WSP_GGML_MAX_NODES];
- struct wsp_ggml_tensor * leafs[WSP_GGML_MAX_NODES];
+ struct wsp_ggml_tensor ** nodes;
+ struct wsp_ggml_tensor ** grads;
+ struct wsp_ggml_tensor ** leafs;
+
+ struct wsp_ggml_hash_set visited_hash_table;

- void * visited_hash_table[WSP_GGML_GRAPH_HASHTABLE_SIZE];
+ enum wsp_ggml_cgraph_eval_order order;

  // performance
  int perf_runs;
@@ -539,8 +582,6 @@ extern "C" {
  int64_t perf_time_us;
  };

- static const size_t WSP_GGML_GRAPH_SIZE = sizeof(struct wsp_ggml_cgraph);
-
  // scratch buffer
  struct wsp_ggml_scratch {
  size_t offs;
@@ -585,6 +626,8 @@ extern "C" {
  WSP_GGML_API int64_t wsp_ggml_cycles(void);
  WSP_GGML_API int64_t wsp_ggml_cycles_per_ms(void);

+ WSP_GGML_API void wsp_ggml_print_backtrace(void);
+
  WSP_GGML_API void wsp_ggml_numa_init(void); // call once for better performance on NUMA systems
  WSP_GGML_API bool wsp_ggml_is_numa(void); // true if init detected that system has >1 NUMA node

@@ -605,6 +648,9 @@ extern "C" {
  WSP_GGML_API const char * wsp_ggml_op_name (enum wsp_ggml_op op);
  WSP_GGML_API const char * wsp_ggml_op_symbol(enum wsp_ggml_op op);

+ WSP_GGML_API const char * wsp_ggml_unary_op_name(enum wsp_ggml_unary_op op);
+ WSP_GGML_API const char * wsp_ggml_op_desc(const struct wsp_ggml_tensor * t); // unary or op name
+
  WSP_GGML_API size_t wsp_ggml_element_size(const struct wsp_ggml_tensor * tensor);

  WSP_GGML_API bool wsp_ggml_is_quantized(enum wsp_ggml_type type);
@@ -674,18 +720,30 @@ extern "C" {
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_dup_tensor (struct wsp_ggml_context * ctx, const struct wsp_ggml_tensor * src);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_view_tensor(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * src);

+ // Context tensor enumeration and lookup
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_first_tensor(struct wsp_ggml_context * ctx);
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_next_tensor (struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_get_tensor(struct wsp_ggml_context * ctx, const char * name);

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_zero(struct wsp_ggml_tensor * tensor);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_i32 (struct wsp_ggml_tensor * tensor, int32_t value);
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_set_f32 (struct wsp_ggml_tensor * tensor, float value);

+ // Converts a flat index into coordinates
+ WSP_GGML_API void wsp_ggml_unravel_index(const struct wsp_ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
  WSP_GGML_API int32_t wsp_ggml_get_i32_1d(const struct wsp_ggml_tensor * tensor, int i);
  WSP_GGML_API void wsp_ggml_set_i32_1d(const struct wsp_ggml_tensor * tensor, int i, int32_t value);

+ WSP_GGML_API int32_t wsp_ggml_get_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+ WSP_GGML_API void wsp_ggml_set_i32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
  WSP_GGML_API float wsp_ggml_get_f32_1d(const struct wsp_ggml_tensor * tensor, int i);
  WSP_GGML_API void wsp_ggml_set_f32_1d(const struct wsp_ggml_tensor * tensor, int i, float value);

+ WSP_GGML_API float wsp_ggml_get_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+ WSP_GGML_API void wsp_ggml_set_f32_nd(const struct wsp_ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
  WSP_GGML_API void * wsp_ggml_get_data (const struct wsp_ggml_tensor * tensor);
  WSP_GGML_API float * wsp_ggml_get_data_f32(const struct wsp_ggml_tensor * tensor);

@@ -719,6 +777,12 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_cast(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * b,
+ enum wsp_ggml_type type);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add1(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -828,6 +892,7 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ // sums repetitions in a into shape of b
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_repeat_back(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -892,6 +957,10 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_leaky(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_relu_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);
@@ -970,14 +1039,23 @@ extern "C" {
  struct wsp_ggml_tensor * b,
  float eps);

- // A: n columns, m rows
- // B: n columns, p rows (i.e. we transpose it internally)
- // result is m columns, p rows
+ // A: k columns, n rows => [ne03, ne02, n, k]
+ // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+ // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  struct wsp_ggml_tensor * b);

+ // indirect matrix multiplication
+ // wsp_ggml_mul_mat_id(ctx, as, ids, id, b) ~= wsp_ggml_mul_mat(as[ids[id]], b)
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_mul_mat_id(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * as[],
+ struct wsp_ggml_tensor * ids,
+ int id,
+ struct wsp_ggml_tensor * b);
+
  // A: m columns, n rows,
  // B: p columns, n rows,
  // result is m columns, p rows
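The new `wsp_ggml_mul_mat_id` declaration above is the indirect ("routed") matrix multiplication described in its comment. A rough, hypothetical sketch of a call follows; `wsp_ggml_new_tensor_1d`, `wsp_ggml_new_tensor_2d`, and the type enums are assumed from the usual `wsp_`-prefixed ggml API and are not part of this diff.

```c
// Hedged sketch: pick one of two candidate weight matrices via the ids tensor,
// assuming an existing struct wsp_ggml_context * ctx.
struct wsp_ggml_tensor * as[2] = {
    wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 64, 128),   // expert 0: k = 64 cols, n = 128 rows
    wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 64, 128),   // expert 1
};
struct wsp_ggml_tensor * ids = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, 1); // holds the selected index
struct wsp_ggml_tensor * b   = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 64, 32);

// per the header comment: roughly equivalent to wsp_ggml_mul_mat(ctx, as[ids[0]], b),
// but the selection happens inside the graph
struct wsp_ggml_tensor * out = wsp_ggml_mul_mat_id(ctx, as, ids, /*id=*/ 0, b);
```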
@@ -1049,7 +1127,6 @@ extern "C" {
  size_t nb1,
  size_t offset);

-
  // a -> b, return view(b)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cpy(
  struct wsp_ggml_context * ctx,
@@ -1072,6 +1149,33 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ // make contiguous, with new shape
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_1d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int64_t ne0);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_2d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_3d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2);
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_cont_4d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int64_t ne0,
+ int64_t ne1,
+ int64_t ne2,
+ int64_t ne3);
+
  // return view(a), b specifies the new shape
  // TODO: when we start computing gradient, make a copy instead of view
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_reshape(
@@ -1207,6 +1311,14 @@ extern "C" {
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a);

+ // fused soft_max(a*scale + mask)
+ // mask is optional
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_ext(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * mask,
+ float scale);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_soft_max_back(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
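The fused `wsp_ggml_soft_max_ext` above folds the scale and an optional mask into a single softmax op. A hedged fragment, assuming attention tensors `kq` and `kq_mask` already exist in a context `ctx`:

```c
// Hedged sketch: fuse masked, scaled attention softmax into one op.
float kq_scale = 0.125f;   // e.g. 1/sqrt(head_dim) with head_dim = 64
// before (separate ops, roughly): wsp_ggml_soft_max(ctx, wsp_ggml_add(ctx, kq_scaled, kq_mask))
// after (single fused op); the mask argument may be NULL when no mask is needed
struct wsp_ggml_tensor * kq_soft_max = wsp_ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale);
```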
@@ -1219,14 +1331,15 @@ extern "C" {
  struct wsp_ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements
+ // if mode & 1 == 1, skip n_past elements (DEPRECATED)
  // if mode & 2 == 1, GPT-NeoX style
  // if mode & 4 == 1, ChatGLM style
- // TODO: avoid creating a new tensor every time
+ //
+ // b is an int32 vector with size a->ne[2], it contains the positions
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  int mode,
  int n_ctx);
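Note the calling-convention change: the former `n_past` integer is replaced by a positions tensor `b` (int32, one entry per token, size `a->ne[2]`). A hedged fragment; everything except `wsp_ggml_rope` itself is assumed from the standard `wsp_`-prefixed ggml API:

```c
// Hypothetical variables: ctx, cur, n_tokens, n_past, n_rot, n_ctx.
struct wsp_ggml_tensor * pos = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_I32, n_tokens);
int32_t * pos_data = (int32_t *) pos->data;
for (int i = 0; i < n_tokens; ++i) {
    pos_data[i] = n_past + i;      // absolute position of each token
}

// old: cur = wsp_ggml_rope(ctx, cur, n_past, n_rot, 0, n_ctx);
// new: positions are carried by a tensor, not an int argument
cur = wsp_ggml_rope(ctx, cur, pos, n_rot, 0, n_ctx);
```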
@@ -1235,7 +1348,7 @@ extern "C" {
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  int mode,
  int n_ctx);
@@ -1244,29 +1357,43 @@ extern "C" {
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  int mode,
  int n_ctx,
+ int n_orig_ctx,
  float freq_base,
- float freq_scale);
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);

  // in-place, returns view(a)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_custom_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  int mode,
  int n_ctx,
+ int n_orig_ctx,
  float freq_base,
- float freq_scale);
+ float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow);
+
+ // compute correction dims for YaRN RoPE scaling
+ void wsp_ggml_rope_yarn_corr_dims(
+ int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // xPos RoPE, in-place, returns view(a)
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_xpos_inplace(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  float base,
  bool down);
@@ -1276,18 +1403,23 @@ extern "C" {
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_rope_back(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
- int n_past,
+ struct wsp_ggml_tensor * b,
  int n_dims,
  int mode,
  int n_ctx,
+ int n_orig_ctx,
  float freq_base,
  float freq_scale,
+ float ext_factor,
+ float attn_factor,
+ float beta_fast,
+ float beta_slow,
  float xpos_base,
  bool xpos_down);

  // alibi position embedding
  // in-place, returns view(a)
- struct wsp_ggml_tensor * wsp_ggml_alibi(
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_alibi(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  int n_past,
@@ -1296,12 +1428,24 @@ extern "C" {

  // clamp
  // in-place, returns view(a)
- struct wsp_ggml_tensor * wsp_ggml_clamp(
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_clamp(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
  float min,
  float max);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_im2col(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * b,
+ int s0,
+ int s1,
+ int p0,
+ int p1,
+ int d0,
+ int d1,
+ bool is_2D);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_1d(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1319,6 +1463,14 @@ extern "C" {
  int s,
  int d);

+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_transpose_1d(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ struct wsp_ggml_tensor * b,
+ int s0,
+ int p0,
+ int d0);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_conv_2d(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1377,6 +1529,8 @@ extern "C" {
  int s0, // stride
  int p0); // padding

+ // the result will have 2*p0 padding for the first dimension
+ // and 2*p1 padding for the second dimension
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_pool_2d(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1385,8 +1539,8 @@ extern "C" {
  int k1,
  int s0,
  int s1,
- int p0,
- int p1);
+ float p0,
+ float p1);

  // nearest interpolate
  // used in stable-diffusion
@@ -1395,6 +1549,23 @@ extern "C" {
  struct wsp_ggml_tensor * a,
  int scale_factor);

+ // sort rows
+ enum wsp_ggml_sort_order {
+ WSP_GGML_SORT_ASC,
+ WSP_GGML_SORT_DESC,
+ };
+
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_argsort(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ enum wsp_ggml_sort_order order);
+
+ // top k elements per row
+ WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_top_k(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_tensor * a,
+ int k);
+
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_flash_attn(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * q,
@@ -1456,7 +1627,6 @@ extern "C" {
  int kh);

  // used in sam
-
  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_add_rel_pos(
  struct wsp_ggml_context * ctx,
  struct wsp_ggml_tensor * a,
@@ -1627,19 +1797,22 @@ extern "C" {
  WSP_GGML_API void wsp_ggml_build_forward_expand (struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_tensor * tensor);
  WSP_GGML_API void wsp_ggml_build_backward_expand(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, struct wsp_ggml_cgraph * gb, bool keep);

- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_forward (struct wsp_ggml_tensor * tensor);
- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_build_backward(struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * gf, bool keep);
-
  // graph allocation in a context
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx);
- WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_build_forward_ctx(struct wsp_ggml_context * ctx, struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph (struct wsp_ggml_context * ctx); // size = WSP_GGML_DEFAULT_GRAPH_SIZE, grads = false
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_new_graph_custom (struct wsp_ggml_context * ctx, size_t size, bool grads);
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_dup (struct wsp_ggml_context * ctx, struct wsp_ggml_cgraph * cgraph);
+ WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_graph_view (struct wsp_ggml_cgraph * cgraph, int i0, int i1);
+ WSP_GGML_API void wsp_ggml_graph_cpy (struct wsp_ggml_cgraph * src, struct wsp_ggml_cgraph * dst);
+ WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph); // zero grads
+ WSP_GGML_API void wsp_ggml_graph_clear (struct wsp_ggml_cgraph * cgraph);
+
  WSP_GGML_API size_t wsp_ggml_graph_overhead(void);
+ WSP_GGML_API size_t wsp_ggml_graph_overhead_custom(size_t size, bool grads);

  // wsp_ggml_graph_plan() has to be called before wsp_ggml_graph_compute()
  // when plan.work_size > 0, caller must allocate memory for plan.work_data
  WSP_GGML_API struct wsp_ggml_cplan wsp_ggml_graph_plan (struct wsp_ggml_cgraph * cgraph, int n_threads /*= WSP_GGML_DEFAULT_N_THREADS*/);
- WSP_GGML_API int wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);
- WSP_GGML_API void wsp_ggml_graph_reset (struct wsp_ggml_cgraph * cgraph);
+ WSP_GGML_API int wsp_ggml_graph_compute(struct wsp_ggml_cgraph * cgraph, struct wsp_ggml_cplan * cplan);

  // same as wsp_ggml_graph_compute() but the work data is allocated as a part of the context
  // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
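A hedged sketch of sizing a metadata-only context for a larger-than-default graph with the custom-graph functions declared above; `wsp_ggml_init` and its parameter struct are assumed from the usual `wsp_`-prefixed ggml API and are not part of this hunk:

```c
// Hedged sketch: reserve just enough context memory for a 4096-node graph without gradients.
size_t graph_nodes = 4096;   // example value, larger than WSP_GGML_DEFAULT_GRAPH_SIZE (2048)

struct wsp_ggml_init_params params = {
    /*.mem_size   =*/ wsp_ggml_graph_overhead_custom(graph_nodes, /*grads=*/ false),
    /*.mem_buffer =*/ NULL,
    /*.no_alloc   =*/ true,   // tensor data is expected to live elsewhere (e.g. a backend buffer)
};
struct wsp_ggml_context * ctx_graph = wsp_ggml_init(params);

struct wsp_ggml_cgraph * gf = wsp_ggml_new_graph_custom(ctx_graph, graph_nodes, /*grads=*/ false);
// ... build the graph with wsp_ggml_build_forward_expand(gf, ...) ...
```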
@@ -1647,8 +1820,8 @@ extern "C" {

  WSP_GGML_API struct wsp_ggml_tensor * wsp_ggml_graph_get_tensor(struct wsp_ggml_cgraph * cgraph, const char * name);

- WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname);
- WSP_GGML_API struct wsp_ggml_cgraph wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval);
+ WSP_GGML_API void wsp_ggml_graph_export(const struct wsp_ggml_cgraph * cgraph, const char * fname);
+ WSP_GGML_API struct wsp_ggml_cgraph * wsp_ggml_graph_import(const char * fname, struct wsp_ggml_context ** ctx_data, struct wsp_ggml_context ** ctx_eval);

  // print info and performance information for the graph
  WSP_GGML_API void wsp_ggml_graph_print(const struct wsp_ggml_cgraph * cgraph);
@@ -1656,6 +1829,16 @@ extern "C" {
  // dump the graph into a file using the dot format
  WSP_GGML_API void wsp_ggml_graph_dump_dot(const struct wsp_ggml_cgraph * gb, const struct wsp_ggml_cgraph * gf, const char * filename);

+ // build gradient checkpointing backward graph gb for gf using provided checkpoints
+ // gb_tmp will contain original backward graph with rewritten backward process nodes,
+ // but without the second forward pass nodes.
+ WSP_GGML_API void wsp_ggml_build_backward_gradient_checkpointing(
+ struct wsp_ggml_context * ctx,
+ struct wsp_ggml_cgraph * gf,
+ struct wsp_ggml_cgraph * gb,
+ struct wsp_ggml_cgraph * gb_tmp,
+ struct wsp_ggml_tensor * * checkpoints,
+ int n_checkpoints);
  //
  // optimization
  //
@@ -1682,6 +1865,7 @@ extern "C" {
  WSP_GGML_OPT_NO_CONTEXT,
  WSP_GGML_OPT_INVALID_WOLFE,
  WSP_GGML_OPT_FAIL,
+ WSP_GGML_OPT_CANCEL,

  WSP_GGML_LINESEARCH_FAIL = -128,
  WSP_GGML_LINESEARCH_MINIMUM_STEP,
@@ -1690,7 +1874,8 @@ extern "C" {
  WSP_GGML_LINESEARCH_INVALID_PARAMETERS,
  };

- typedef void (*wsp_ggml_opt_callback)(void * data, float * sched);
+ typedef void (*wsp_ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+ typedef void (*wsp_ggml_log_callback)(enum wsp_ggml_log_level level, const char * text, void * user_data);

  // optimization parameters
  //
@@ -1699,6 +1884,8 @@ extern "C" {
  struct wsp_ggml_opt_params {
  enum wsp_ggml_opt_type type;

+ size_t graph_size;
+
  int n_threads;

  // delta-based convergence test
@@ -1721,6 +1908,8 @@ extern "C" {
  bool print_forward_graph;
  bool print_backward_graph;

+ int n_gradient_accumulation;
+
  // ADAM parameters
  struct {
  int n_iter;
@@ -1766,6 +1955,7 @@ extern "C" {
  float loss_after;

  struct {
+ struct wsp_ggml_tensor * g; // current gradient
  struct wsp_ggml_tensor * m; // first moment
  struct wsp_ggml_tensor * v; // second moment
  struct wsp_ggml_tensor * pf; // past function values
@@ -1829,134 +2019,142 @@ extern "C" {
  // quantization
  //

- WSP_GGML_API size_t wsp_ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
- WSP_GGML_API size_t wsp_ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ // TODO: these would probably get removed in favor of the more general wsp_ggml_wsp_quantize_chunk
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);

- WSP_GGML_API size_t wsp_ggml_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+ WSP_GGML_API size_t wsp_ggml_wsp_quantize_chunk(enum wsp_ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

  //
  // gguf
  //

- enum gguf_type {
- GGUF_TYPE_UINT8 = 0,
- GGUF_TYPE_INT8 = 1,
- GGUF_TYPE_UINT16 = 2,
- GGUF_TYPE_INT16 = 3,
- GGUF_TYPE_UINT32 = 4,
- GGUF_TYPE_INT32 = 5,
- GGUF_TYPE_FLOAT32 = 6,
- GGUF_TYPE_BOOL = 7,
- GGUF_TYPE_STRING = 8,
- GGUF_TYPE_ARRAY = 9,
- GGUF_TYPE_UINT64 = 10,
- GGUF_TYPE_INT64 = 11,
- GGUF_TYPE_FLOAT64 = 12,
- GGUF_TYPE_COUNT, // marks the end of the enum
+ enum wsp_gguf_type {
+ WSP_GGUF_TYPE_UINT8 = 0,
+ WSP_GGUF_TYPE_INT8 = 1,
+ WSP_GGUF_TYPE_UINT16 = 2,
+ WSP_GGUF_TYPE_INT16 = 3,
+ WSP_GGUF_TYPE_UINT32 = 4,
+ WSP_GGUF_TYPE_INT32 = 5,
+ WSP_GGUF_TYPE_FLOAT32 = 6,
+ WSP_GGUF_TYPE_BOOL = 7,
+ WSP_GGUF_TYPE_STRING = 8,
+ WSP_GGUF_TYPE_ARRAY = 9,
+ WSP_GGUF_TYPE_UINT64 = 10,
+ WSP_GGUF_TYPE_INT64 = 11,
+ WSP_GGUF_TYPE_FLOAT64 = 12,
+ WSP_GGUF_TYPE_COUNT, // marks the end of the enum
  };

- struct gguf_context;
+ struct wsp_gguf_context;

- struct gguf_init_params {
+ struct wsp_gguf_init_params {
  bool no_alloc;

  // if not NULL, create a wsp_ggml_context and allocate the tensor data in it
  struct wsp_ggml_context ** ctx;
  };

- WSP_GGML_API struct gguf_context * gguf_init_empty(void);
- WSP_GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
- //WSP_GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
- WSP_GGML_API void gguf_free(struct gguf_context * ctx);
-
- WSP_GGML_API const char * gguf_type_name(enum gguf_type type);
-
- WSP_GGML_API int gguf_get_version (const struct gguf_context * ctx);
- WSP_GGML_API size_t gguf_get_alignment (const struct gguf_context * ctx);
- WSP_GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
- WSP_GGML_API void * gguf_get_data (const struct gguf_context * ctx);
-
- WSP_GGML_API int gguf_get_n_kv(const struct gguf_context * ctx);
- WSP_GGML_API int gguf_find_key(const struct gguf_context * ctx, const char * key);
- WSP_GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int i);
-
- WSP_GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int i);
- WSP_GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int i);
-
- // results are undefined if the wrong type is used for the key
- WSP_GGML_API uint8_t gguf_get_val_u8 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int8_t gguf_get_val_i8 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint16_t gguf_get_val_u16 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int16_t gguf_get_val_i16 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint32_t gguf_get_val_u32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int32_t gguf_get_val_i32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API float gguf_get_val_f32 (const struct gguf_context * ctx, int i);
- WSP_GGML_API uint64_t gguf_get_val_u64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API int64_t gguf_get_val_i64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API double gguf_get_val_f64 (const struct gguf_context * ctx, int i);
- WSP_GGML_API bool gguf_get_val_bool(const struct gguf_context * ctx, int i);
- WSP_GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int i);
- WSP_GGML_API int gguf_get_arr_n (const struct gguf_context * ctx, int i);
- WSP_GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int i);
- WSP_GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
- WSP_GGML_API int gguf_get_n_tensors (const struct gguf_context * ctx);
- WSP_GGML_API int gguf_find_tensor (const struct gguf_context * ctx, const char * name);
- WSP_GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
- WSP_GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i);
+ WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_empty(void);
+ WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_file(const char * fname, struct wsp_gguf_init_params params);
+ //WSP_GGML_API struct wsp_gguf_context * wsp_gguf_init_from_buffer(..);
+
+ WSP_GGML_API void wsp_gguf_free(struct wsp_gguf_context * ctx);
+
+ WSP_GGML_API const char * wsp_gguf_type_name(enum wsp_gguf_type type);
+
+ WSP_GGML_API int wsp_gguf_get_version (const struct wsp_gguf_context * ctx);
+ WSP_GGML_API size_t wsp_gguf_get_alignment (const struct wsp_gguf_context * ctx);
+ WSP_GGML_API size_t wsp_gguf_get_data_offset(const struct wsp_gguf_context * ctx);
+ WSP_GGML_API void * wsp_gguf_get_data (const struct wsp_gguf_context * ctx);
+
+ WSP_GGML_API int wsp_gguf_get_n_kv(const struct wsp_gguf_context * ctx);
+ WSP_GGML_API int wsp_gguf_find_key(const struct wsp_gguf_context * ctx, const char * key);
+ WSP_GGML_API const char * wsp_gguf_get_key (const struct wsp_gguf_context * ctx, int key_id);
+
+ WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_kv_type (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API enum wsp_gguf_type wsp_gguf_get_arr_type(const struct wsp_gguf_context * ctx, int key_id);
+
+ // will abort if the wrong type is used for the key
+ WSP_GGML_API uint8_t wsp_gguf_get_val_u8 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API int8_t wsp_gguf_get_val_i8 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API uint16_t wsp_gguf_get_val_u16 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API int16_t wsp_gguf_get_val_i16 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API uint32_t wsp_gguf_get_val_u32 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API int32_t wsp_gguf_get_val_i32 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API float wsp_gguf_get_val_f32 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API uint64_t wsp_gguf_get_val_u64 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API int64_t wsp_gguf_get_val_i64 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API double wsp_gguf_get_val_f64 (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API bool wsp_gguf_get_val_bool(const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API const char * wsp_gguf_get_val_str (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API const void * wsp_gguf_get_val_data(const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API int wsp_gguf_get_arr_n (const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API const void * wsp_gguf_get_arr_data(const struct wsp_gguf_context * ctx, int key_id);
+ WSP_GGML_API const char * wsp_gguf_get_arr_str (const struct wsp_gguf_context * ctx, int key_id, int i);
+
+ WSP_GGML_API int wsp_gguf_get_n_tensors (const struct wsp_gguf_context * ctx);
+ WSP_GGML_API int wsp_gguf_find_tensor (const struct wsp_gguf_context * ctx, const char * name);
+ WSP_GGML_API size_t wsp_gguf_get_tensor_offset(const struct wsp_gguf_context * ctx, int i);
+ WSP_GGML_API char * wsp_gguf_get_tensor_name (const struct wsp_gguf_context * ctx, int i);

  // overrides existing values or adds a new one
- WSP_GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val);
- WSP_GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val);
- WSP_GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
- WSP_GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t val);
- WSP_GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
- WSP_GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
- WSP_GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
- WSP_GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
- WSP_GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
- WSP_GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
- WSP_GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
- WSP_GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
- WSP_GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
- WSP_GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+ WSP_GGML_API void wsp_gguf_set_val_u8 (struct wsp_gguf_context * ctx, const char * key, uint8_t val);
+ WSP_GGML_API void wsp_gguf_set_val_i8 (struct wsp_gguf_context * ctx, const char * key, int8_t val);
+ WSP_GGML_API void wsp_gguf_set_val_u16 (struct wsp_gguf_context * ctx, const char * key, uint16_t val);
+ WSP_GGML_API void wsp_gguf_set_val_i16 (struct wsp_gguf_context * ctx, const char * key, int16_t val);
+ WSP_GGML_API void wsp_gguf_set_val_u32 (struct wsp_gguf_context * ctx, const char * key, uint32_t val);
+ WSP_GGML_API void wsp_gguf_set_val_i32 (struct wsp_gguf_context * ctx, const char * key, int32_t val);
+ WSP_GGML_API void wsp_gguf_set_val_f32 (struct wsp_gguf_context * ctx, const char * key, float val);
+ WSP_GGML_API void wsp_gguf_set_val_u64 (struct wsp_gguf_context * ctx, const char * key, uint64_t val);
+ WSP_GGML_API void wsp_gguf_set_val_i64 (struct wsp_gguf_context * ctx, const char * key, int64_t val);
+ WSP_GGML_API void wsp_gguf_set_val_f64 (struct wsp_gguf_context * ctx, const char * key, double val);
+ WSP_GGML_API void wsp_gguf_set_val_bool(struct wsp_gguf_context * ctx, const char * key, bool val);
+ WSP_GGML_API void wsp_gguf_set_val_str (struct wsp_gguf_context * ctx, const char * key, const char * val);
+ WSP_GGML_API void wsp_gguf_set_arr_data(struct wsp_gguf_context * ctx, const char * key, enum wsp_gguf_type type, const void * data, int n);
+ WSP_GGML_API void wsp_gguf_set_arr_str (struct wsp_gguf_context * ctx, const char * key, const char ** data, int n);

  // set or add KV pairs from another context
- WSP_GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+ WSP_GGML_API void wsp_gguf_set_kv(struct wsp_gguf_context * ctx, struct wsp_gguf_context * src);

  // manage tensor info
- WSP_GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct wsp_ggml_tensor * tensor);
- WSP_GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum wsp_ggml_type type);
- WSP_GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+ WSP_GGML_API void wsp_gguf_add_tensor(struct wsp_gguf_context * ctx, const struct wsp_ggml_tensor * tensor);
+ WSP_GGML_API void wsp_gguf_set_tensor_type(struct wsp_gguf_context * ctx, const char * name, enum wsp_ggml_type type);
+ WSP_GGML_API void wsp_gguf_set_tensor_data(struct wsp_gguf_context * ctx, const char * name, const void * data, size_t size);

  // writing gguf files can be done in 2 ways:
  //
- // - write the entire gguf_context to a binary file in a single pass:
+ // - write the entire wsp_gguf_context to a binary file in a single pass:
  //
- // gguf_write_to_file(ctx, fname);
+ // wsp_gguf_write_to_file(ctx, fname);
  //
  // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
  //
  // FILE * f = fopen(fname, "wb");
- // fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+ // fseek(f, wsp_gguf_get_meta_size(ctx), SEEK_SET);
  // fwrite(f, ...);
- // void * data = gguf_meta_get_meta_data(ctx);
+ // void * data = wsp_gguf_meta_get_meta_data(ctx);
  // fseek(f, 0, SEEK_SET);
- // fwrite(f, data, gguf_get_meta_size(ctx));
+ // fwrite(f, data, wsp_gguf_get_meta_size(ctx));
  // free(data);
  // fclose(f);
  //

  // write the entire context to a binary file
- WSP_GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+ WSP_GGML_API void wsp_gguf_write_to_file(const struct wsp_gguf_context * ctx, const char * fname, bool only_meta);

  // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
- WSP_GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
- WSP_GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+ WSP_GGML_API size_t wsp_gguf_get_meta_size(const struct wsp_gguf_context * ctx);
+ WSP_GGML_API void wsp_gguf_get_meta_data(const struct wsp_gguf_context * ctx, void * data);

  //
  // system info
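A minimal sketch of the single-pass write path described in the comment above, assuming `tensor` is an existing `struct wsp_ggml_tensor *` and the key names are purely illustrative:

```c
// Hedged sketch: build a gguf context in memory and write it out in one pass.
struct wsp_gguf_context * gctx = wsp_gguf_init_empty();

wsp_gguf_set_val_str(gctx, "general.architecture", "whisper");                     // hypothetical key/value
wsp_gguf_set_val_u32(gctx, "general.quantization_version", WSP_GGML_QNT_VERSION);

wsp_gguf_add_tensor(gctx, tensor);   // registers the tensor's metadata and data

wsp_gguf_write_to_file(gctx, "model.gguf", /*only_meta=*/ false);
wsp_gguf_free(gctx);
```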
@@ -2008,7 +2206,7 @@
  enum wsp_ggml_type vec_dot_type;
  } wsp_ggml_type_traits_t;

- wsp_ggml_type_traits_t wsp_ggml_internal_get_type_traits(enum wsp_ggml_type type);
+ WSP_GGML_API wsp_ggml_type_traits_t wsp_ggml_internal_get_type_traits(enum wsp_ggml_type type);

  #ifdef __cplusplus
  }