whisper.rn 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cpp/ggml.c +5349 -5349
- package/cpp/ggml.h +810 -810
- package/cpp/whisper.cpp +518 -518
- package/cpp/whisper.h +2 -2
- package/lib/commonjs/NativeRNWhisper.js.map +1 -1
- package/lib/commonjs/index.js +3 -0
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/NativeRNWhisper.js +3 -0
- package/lib/module/NativeRNWhisper.js.map +1 -1
- package/lib/module/index.js +3 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNWhisper.d.ts +1 -3
- package/lib/typescript/NativeRNWhisper.d.ts.map +1 -1
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNWhisper.ts +2 -3
- package/src/index.ts +2 -1
- package/whisper-rn.podspec +1 -1
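Nearly every hunk in the whisper.cpp diff below follows one mechanical pattern: each ggml_/GGML_ identifier in the vendored C/C++ sources is renamed to a wsp_ggml_/WSP_GGML_ equivalent. A plausible motivation (an assumption; the diff itself states no rationale) is link-time isolation: ggml exports plain C symbols, so an app that also links another ggml-based library would otherwise hit duplicate-symbol errors. A minimal before/after sketch of the pattern:

    // Before (upstream ggml): generic C symbol names.
    struct ggml_context * ggml_init(struct ggml_init_params params);

    // After (this package): the same API under a unique prefix, so a second
    // vendored copy of ggml in the same binary cannot collide with it.
    struct wsp_ggml_context * wsp_ggml_init(struct wsp_ggml_init_params params);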
package/cpp/whisper.cpp
CHANGED
@@ -28,7 +28,7 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

-#if defined(GGML_BIG_ENDIAN)
+#if defined(WSP_GGML_BIG_ENDIAN)
 #include <bit>

 template<typename T>
@@ -42,28 +42,28 @@ float byteswap(float value) {
 }

 template<typename T>
-static void byteswap_tensor_data(ggml_tensor * tensor) {
+static void byteswap_tensor_data(wsp_ggml_tensor * tensor) {
     T * datum = reinterpret_cast<T *>(tensor->data);
-    for (int i = 0; i < ggml_nelements(tensor); i++) {
+    for (int i = 0; i < wsp_ggml_nelements(tensor); i++) {
         datum[i] = byteswap(datum[i]);
     }
 }

-static void byteswap_tensor(ggml_tensor * tensor) {
+static void byteswap_tensor(wsp_ggml_tensor * tensor) {
     switch (tensor->type) {
-        case GGML_TYPE_I16: {
+        case WSP_GGML_TYPE_I16: {
             byteswap_tensor_data<int16_t>(tensor);
             break;
         }
-        case GGML_TYPE_F16: {
-            byteswap_tensor_data<ggml_fp16_t>(tensor);
+        case WSP_GGML_TYPE_F16: {
+            byteswap_tensor_data<wsp_ggml_fp16_t>(tensor);
             break;
         }
-        case GGML_TYPE_I32: {
+        case WSP_GGML_TYPE_I32: {
             byteswap_tensor_data<int32_t>(tensor);
             break;
         }
-        case GGML_TYPE_F32: {
+        case WSP_GGML_TYPE_F32: {
             byteswap_tensor_data<float>(tensor);
             break;
         }
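These helpers fix up tensor byte order when whisper.cpp is built for a big-endian target (the hunk header shows they sit beside a float byteswap(float) overload, and <bit> is included above): ggml model files store little-endian data, so every element must be swapped after reading. A minimal sketch of such overloads, assuming C++23's std::byteswap (the package's actual helpers may differ):

    #include <bit>      // std::bit_cast; std::byteswap is C++23
    #include <cstdint>

    // Integral values: reverse the byte order directly.
    template<typename T>
    static T byteswap(T value) {
        return std::byteswap(value);
    }

    // Floats: reinterpret as a 32-bit integer, swap, reinterpret back.
    static float byteswap(float value) {
        return std::bit_cast<float>(std::byteswap(std::bit_cast<std::uint32_t>(value)));
    }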
@@ -263,8 +263,8 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
     { MODEL_LARGE, 9ull*MB },
 };

-static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
-    { GGML_TYPE_F32,
+static const std::map<wsp_ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
+    { WSP_GGML_TYPE_F32,
         {
             { MODEL_TINY, 74ull*MB },
             { MODEL_BASE, 142ull*MB },
@@ -273,7 +273,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 2952ull*MB },
         },
     },
-    { GGML_TYPE_F16,
+    { WSP_GGML_TYPE_F16,
         {
             { MODEL_TINY, 74ull*MB },
             { MODEL_BASE, 142ull*MB },
@@ -282,7 +282,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 2952ull*MB },
         },
     },
-    { GGML_TYPE_Q4_0,
+    { WSP_GGML_TYPE_Q4_0,
         {
             { MODEL_TINY, 26ull*MB },
             { MODEL_BASE, 50ull*MB },
@@ -291,7 +291,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 940ull*MB },
         },
     },
-    { GGML_TYPE_Q4_1,
+    { WSP_GGML_TYPE_Q4_1,
         {
             { MODEL_TINY, 32ull*MB },
             { MODEL_BASE, 58ull*MB },
@@ -300,7 +300,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1124ull*MB },
         },
     },
-    { GGML_TYPE_Q5_0,
+    { WSP_GGML_TYPE_Q5_0,
         {
             { MODEL_TINY, 30ull*MB },
             { MODEL_BASE, 54ull*MB },
@@ -309,7 +309,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1034ull*MB },
         },
     },
-    { GGML_TYPE_Q5_1,
+    { WSP_GGML_TYPE_Q5_1,
         {
             { MODEL_TINY, 32ull*MB },
             { MODEL_BASE, 58ull*MB },
@@ -318,7 +318,7 @@ static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
             { MODEL_LARGE, 1124ull*MB },
         },
     },
-    { GGML_TYPE_Q8_0,
+    { WSP_GGML_TYPE_Q8_0,
         {
             { MODEL_TINY, 45ull*MB },
             { MODEL_BASE, 84ull*MB },
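The renamed MEM_REQ_MODEL table is keyed first by weight type and then by model size, so callers can size the model buffer with a nested lookup. A hedged usage sketch (the actual call sites fall outside this diff):

    // e.g. a base model with Q5_0 weights reserves roughly 54 MB of tensor storage:
    const size_t mem_required_model =
        MEM_REQ_MODEL.at(WSP_GGML_TYPE_Q5_0).at(MODEL_BASE); // 54ull*MB per the table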
@@ -446,95 +446,95 @@ struct whisper_hparams {
 // audio encoding layer
 struct whisper_layer_encoder {
     // encoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
+    struct wsp_ggml_tensor * attn_ln_0_w;
+    struct wsp_ggml_tensor * attn_ln_0_b;

     // encoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
+    struct wsp_ggml_tensor * attn_ln_1_w;
+    struct wsp_ggml_tensor * attn_ln_1_b;

     // encoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
+    struct wsp_ggml_tensor * attn_q_w;
+    struct wsp_ggml_tensor * attn_q_b;

     // encoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
+    struct wsp_ggml_tensor * attn_k_w;

     // encoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
+    struct wsp_ggml_tensor * attn_v_w;
+    struct wsp_ggml_tensor * attn_v_b;

     // encoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
+    struct wsp_ggml_tensor * mlp_ln_w;
+    struct wsp_ggml_tensor * mlp_ln_b;

     // encoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
+    struct wsp_ggml_tensor * mlp_0_w;
+    struct wsp_ggml_tensor * mlp_0_b;

     // encoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
+    struct wsp_ggml_tensor * mlp_1_w;
+    struct wsp_ggml_tensor * mlp_1_b;
 };

 // token decoding layer
 struct whisper_layer_decoder {
     // decoder.blocks.*.attn_ln
-    struct ggml_tensor * attn_ln_0_w;
-    struct ggml_tensor * attn_ln_0_b;
+    struct wsp_ggml_tensor * attn_ln_0_w;
+    struct wsp_ggml_tensor * attn_ln_0_b;

     // decoder.blocks.*.attn.out
-    struct ggml_tensor * attn_ln_1_w;
-    struct ggml_tensor * attn_ln_1_b;
+    struct wsp_ggml_tensor * attn_ln_1_w;
+    struct wsp_ggml_tensor * attn_ln_1_b;

     // decoder.blocks.*.attn.query
-    struct ggml_tensor * attn_q_w;
-    struct ggml_tensor * attn_q_b;
+    struct wsp_ggml_tensor * attn_q_w;
+    struct wsp_ggml_tensor * attn_q_b;

     // decoder.blocks.*.attn.key
-    struct ggml_tensor * attn_k_w;
+    struct wsp_ggml_tensor * attn_k_w;

     // decoder.blocks.*.attn.value
-    struct ggml_tensor * attn_v_w;
-    struct ggml_tensor * attn_v_b;
+    struct wsp_ggml_tensor * attn_v_w;
+    struct wsp_ggml_tensor * attn_v_b;

     // decoder.blocks.*.cross_attn_ln
-    struct ggml_tensor * cross_attn_ln_0_w;
-    struct ggml_tensor * cross_attn_ln_0_b;
+    struct wsp_ggml_tensor * cross_attn_ln_0_w;
+    struct wsp_ggml_tensor * cross_attn_ln_0_b;

     // decoder.blocks.*.cross_attn.out
-    struct ggml_tensor * cross_attn_ln_1_w;
-    struct ggml_tensor * cross_attn_ln_1_b;
+    struct wsp_ggml_tensor * cross_attn_ln_1_w;
+    struct wsp_ggml_tensor * cross_attn_ln_1_b;

     // decoder.blocks.*.cross_attn.query
-    struct ggml_tensor * cross_attn_q_w;
-    struct ggml_tensor * cross_attn_q_b;
+    struct wsp_ggml_tensor * cross_attn_q_w;
+    struct wsp_ggml_tensor * cross_attn_q_b;

     // decoder.blocks.*.cross_attn.key
-    struct ggml_tensor * cross_attn_k_w;
+    struct wsp_ggml_tensor * cross_attn_k_w;

     // decoder.blocks.*.cross_attn.value
-    struct ggml_tensor * cross_attn_v_w;
-    struct ggml_tensor * cross_attn_v_b;
+    struct wsp_ggml_tensor * cross_attn_v_w;
+    struct wsp_ggml_tensor * cross_attn_v_b;

     // decoder.blocks.*.mlp_ln
-    struct ggml_tensor * mlp_ln_w;
-    struct ggml_tensor * mlp_ln_b;
+    struct wsp_ggml_tensor * mlp_ln_w;
+    struct wsp_ggml_tensor * mlp_ln_b;

     // decoder.blocks.*.mlp.0
-    struct ggml_tensor * mlp_0_w;
-    struct ggml_tensor * mlp_0_b;
+    struct wsp_ggml_tensor * mlp_0_w;
+    struct wsp_ggml_tensor * mlp_0_b;

     // decoder.blocks.*.mlp.2
-    struct ggml_tensor * mlp_1_w;
-    struct ggml_tensor * mlp_1_b;
+    struct wsp_ggml_tensor * mlp_1_w;
+    struct wsp_ggml_tensor * mlp_1_b;
 };

 struct whisper_kv_cache {
-    struct ggml_tensor * k;
-    struct ggml_tensor * v;
+    struct wsp_ggml_tensor * k;
+    struct wsp_ggml_tensor * v;

-    struct ggml_context * ctx;
+    struct wsp_ggml_context * ctx;

     std::vector<uint8_t> buf;

@@ -548,42 +548,42 @@ struct whisper_model {
     whisper_filters filters;

     // encoder.positional_embedding
-    struct ggml_tensor * e_pe;
+    struct wsp_ggml_tensor * e_pe;

     // encoder.conv1
-    struct ggml_tensor * e_conv_1_w;
-    struct ggml_tensor * e_conv_1_b;
+    struct wsp_ggml_tensor * e_conv_1_w;
+    struct wsp_ggml_tensor * e_conv_1_b;

     // encoder.conv2
-    struct ggml_tensor * e_conv_2_w;
-    struct ggml_tensor * e_conv_2_b;
+    struct wsp_ggml_tensor * e_conv_2_w;
+    struct wsp_ggml_tensor * e_conv_2_b;

     // encoder.ln_post
-    struct ggml_tensor * e_ln_w;
-    struct ggml_tensor * e_ln_b;
+    struct wsp_ggml_tensor * e_ln_w;
+    struct wsp_ggml_tensor * e_ln_b;

     // decoder.positional_embedding
-    struct ggml_tensor * d_pe;
+    struct wsp_ggml_tensor * d_pe;

     // decoder.token_embedding
-    struct ggml_tensor * d_te;
+    struct wsp_ggml_tensor * d_te;

     // decoder.ln
-    struct ggml_tensor * d_ln_w;
-    struct ggml_tensor * d_ln_b;
+    struct wsp_ggml_tensor * d_ln_w;
+    struct wsp_ggml_tensor * d_ln_b;

     std::vector<whisper_layer_encoder> layers_encoder;
     std::vector<whisper_layer_decoder> layers_decoder;

     // context
-    struct ggml_context * ctx;
+    struct wsp_ggml_context * ctx;

     // the model memory buffer is read-only and can be shared between processors
     std::vector<uint8_t> * buf;

     // tensors
     int n_loaded;
-    std::map<std::string, struct ggml_tensor *> tensors;
+    std::map<std::string, struct wsp_ggml_tensor *> tensors;
 };

 struct whisper_sequence {
@@ -678,15 +678,15 @@ struct whisper_state {
     // [EXPERIMENTAL] speed-up techniques
     int32_t exp_n_audio_ctx = 0; // 0 - use default

-    void use_buf(struct ggml_context * ctx, int i) {
+    void use_buf(struct wsp_ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
         size_t last_size = 0;

         if (i == -1) {
-            last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
+            last_size = wsp_ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
+            last_size = wsp_ggml_set_scratch(ctx, { 0, buf.size(), buf.data(), });
         }

         if (buf_last >= 0) {
@@ -714,8 +714,8 @@ struct whisper_context {
     int64_t t_load_us = 0;
     int64_t t_start_us = 0;

-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
-    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
+    wsp_ggml_type wtype = wsp_ggml_type::WSP_GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
+    wsp_ggml_type itype = wsp_ggml_type::WSP_GGML_TYPE_F16; // intermediate type (FP32 or FP16)

     whisper_model model;
     whisper_vocab vocab;
@@ -749,17 +749,17 @@ static bool kv_cache_init(
         const struct whisper_hparams & hparams,
         const size_t mem_bytes,
         struct whisper_kv_cache & cache,
-        ggml_type wtype,
+        wsp_ggml_type wtype,
         int n_ctx) {
     cache.buf.resize(mem_bytes);

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ cache.buf.size(),
         /*.mem_buffer =*/ cache.buf.data(),
         /*.no_alloc =*/ false,
     };

-    cache.ctx = ggml_init(params);
+    cache.ctx = wsp_ggml_init(params);

     if (!cache.ctx) {
         log("%s: failed to allocate memory for kv cache\n", __func__);
@@ -772,8 +772,8 @@ static bool kv_cache_init(
     const int n_mem = n_text_layer*n_ctx;
     const int n_elements = n_text_state*n_mem;

-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);

     return true;
 }
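Both cache tensors are allocated as flat 1-D arrays of n_text_state*n_text_layer*n_ctx elements. A back-of-the-envelope sketch with assumed hyperparameters (Whisper base: n_text_state = 512, n_text_layer = 6, n_ctx = 448):

    const int    n_mem      = 6 * 448;               // n_text_layer*n_ctx = 2688
    const int    n_elements = 512 * n_mem;           // per tensor: 1,376,256
    const size_t kv_bytes   = 2ull * n_elements * 2; // K + V at F16 (2 B): ~5.5 MB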
@@ -781,36 +781,36 @@ static bool kv_cache_init(
 static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
     WHISPER_ASSERT(cache.ctx);

-    const int n_elements = ggml_nelements(cache.k);
-    WHISPER_ASSERT(n_elements == ggml_nelements(cache.v));
+    const int n_elements = wsp_ggml_nelements(cache.k);
+    WHISPER_ASSERT(n_elements == wsp_ggml_nelements(cache.v));

-    const ggml_type wtype = cache.k->type;
+    const wsp_ggml_type wtype = cache.k->type;
     WHISPER_ASSERT(wtype == cache.v->type);

-    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
+    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*wsp_ggml_type_sizef(wtype));

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ cache.buf.size(),
         /*.mem_buffer =*/ cache.buf.data(),
         /*.no_alloc =*/ false,
     };

-    cache.ctx = ggml_init(params);
+    cache.ctx = wsp_ggml_init(params);

     if (!cache.ctx) {
         log("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }

-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.v = wsp_ggml_new_tensor_1d(cache.ctx, wtype, n_elements);

     return true;
 }

 static void kv_cache_free(struct whisper_kv_cache & cache) {
     if (cache.ctx) {
-        ggml_free(cache.ctx);
+        wsp_ggml_free(cache.ctx);
         cache.ctx = nullptr;
     }
 }
@@ -829,7 +829,7 @@ static void kv_cache_free(struct whisper_kv_cache & cache) {
 static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
     log("%s: loading model\n", __func__);

-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();

     wctx.t_start_us = t_start_us;

@@ -840,7 +840,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
     {
         uint32_t magic;
         read_safe(loader, magic);
-        if (magic != GGML_FILE_MAGIC) {
+        if (magic != WSP_GGML_FILE_MAGIC) {
             log("%s: invalid model data (bad magic)\n", __func__);
             return false;
         }
@@ -884,14 +884,14 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             model.type = e_model::MODEL_LARGE;
         }

-        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t qntvr = hparams.ftype / WSP_GGML_QNT_VERSION_FACTOR;

-        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+        hparams.ftype %= WSP_GGML_QNT_VERSION_FACTOR;

         // for the big tensors, we have the option to store the data in 16-bit floats or quantized
         // in order to save memory and also to speed up the computation
-        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
-        if (wctx.wtype == GGML_TYPE_COUNT) {
+        wctx.wtype = wsp_ggml_ftype_to_wsp_ggml_type((wsp_ggml_ftype) (model.hparams.ftype));
+        if (wctx.wtype == WSP_GGML_TYPE_COUNT) {
             log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
             return false;
         }
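The two renamed lines above decode ggml's packed ftype field: the integer quotient is the quantization format version and the remainder is the actual tensor-type selector. A worked example, assuming ggml's conventional factor of 1000:

    const int32_t factor = 1000;          // conventional WSP_GGML_QNT_VERSION_FACTOR
    int32_t ftype = 2002;                 // hypothetical value read from the model header
    const int32_t qntvr = ftype / factor; // 2 -> quantization format version 2
    ftype %= factor;                      // 2 -> e.g. "mostly Q4_0" in ggml's ftype enum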
@@ -1033,8 +1033,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

     size_t ctx_size = 0;

-    const ggml_type wtype = wctx.wtype;
-    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
+    const wsp_ggml_type wtype = wctx.wtype;
+    const wsp_ggml_type vtype = wctx.wtype == WSP_GGML_TYPE_F32 ? WSP_GGML_TYPE_F32 : WSP_GGML_TYPE_F16; // conv type

     {
         const auto & hparams = model.hparams;
@@ -1053,92 +1053,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // encoder
         {
-            ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
+            ctx_size += n_audio_ctx*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_pe;

-            ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
+            ctx_size += 3*n_mels*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_1_w
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_1_b

-            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
+            ctx_size += 3*n_audio_state*n_audio_state*wsp_ggml_type_sizef(vtype); // e_conv_2_w
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_conv_2_b

-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
-            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_w;
+            ctx_size += n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // e_ln_b;
         }

         // decoder
         {
-            ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
+            ctx_size += n_text_ctx*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_pe;

-            ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
+            ctx_size += n_vocab*n_text_state*wsp_ggml_type_sizef(wtype); // d_te;

-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
-            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
+            ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_w;
+            ctx_size += n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32); // d_ln_b;
         }

         // encoder layers
         {
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
-            ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
+            ctx_size += n_audio_layer*( 4*n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_audio_layer*(n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
+            ctx_size += n_audio_layer*( n_audio_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
         }

         // decoder layers
         {
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
-            ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_0_w
+            ctx_size += n_text_layer*( 4*n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // mlp_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_q_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_v_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // attn_ln_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // attn_ln_1_b
             //
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_q_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_v_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
-            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*wsp_ggml_type_sizef(wtype)); // cross_attn_ln_1_w
+            ctx_size += n_text_layer*( n_text_state*wsp_ggml_type_sizef(WSP_GGML_TYPE_F32)); // cross_attn_ln_1_b
         }

         ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
@@ -1148,15 +1148,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

     // create the ggml context
     {
-        struct ggml_init_params params = {
+        struct wsp_ggml_init_params params = {
             /*.mem_size =*/ wctx.model.buf->size(),
             /*.mem_buffer =*/ wctx.model.buf->data(),
             /*.no_alloc =*/ false,
         };

-        model.ctx = ggml_init(params);
+        model.ctx = wsp_ggml_init(params);
         if (!model.ctx) {
-            log("%s: ggml_init() failed\n", __func__);
+            log("%s: wsp_ggml_init() failed\n", __func__);
             return false;
         }
     }
@@ -1184,16 +1184,16 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // encoder
         {
-            model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
+            model.e_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_audio_state, n_audio_ctx);

-            model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
-            model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
+            model.e_conv_1_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
+            model.e_conv_1_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);

-            model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
-            model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
+            model.e_conv_2_w = wsp_ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
+            model.e_conv_2_b = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, 1, n_audio_state);

-            model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-            model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+            model.e_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+            model.e_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

             // map by name
             model.tensors["encoder.positional_embedding"] = model.e_pe;
@@ -1210,28 +1210,28 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             for (int i = 0; i < n_audio_layer; ++i) {
                 auto & layer = model.layers_encoder[i];

-                layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-                layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+                layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
-                layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
+                layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+                layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_audio_state);

-                layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
-                layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+                layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);

-                layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+                layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+                layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_audio_state);

                 // map by name
                 model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1261,12 +1261,12 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {

         // decoder
         {
-            model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
+            model.d_pe = wsp_ggml_new_tensor_2d(ctx, WSP_GGML_TYPE_F32, n_text_state, n_text_ctx);

-            model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
+            model.d_te = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);

-            model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-            model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+            model.d_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+            model.d_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

             // map by name
             model.tensors["decoder.positional_embedding"] = model.d_pe;
@@ -1279,42 +1279,42 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             for (int i = 0; i < n_text_layer; ++i) {
                 auto & layer = model.layers_decoder[i];

-                layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.mlp_ln_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.mlp_ln_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
-                layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
+                layer.mlp_0_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
+                layer.mlp_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, 4*n_text_state);

-                layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
-                layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.mlp_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
+                layer.mlp_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);

-                layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
-                layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_0_w = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_0_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_q_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_q_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_k_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);

-                layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_v_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_v_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

-                layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
-                layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+                layer.cross_attn_ln_1_w = wsp_ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+                layer.cross_attn_ln_1_b = wsp_ggml_new_tensor_1d(ctx, WSP_GGML_TYPE_F32, n_text_state);

                 // map by name
                 model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
@@ -1394,7 +1394,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
             }

             auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
+            if (wsp_ggml_nelements(tensor) != nelements) {
                 log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
                 log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
                     __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
@@ -1407,19 +1407,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
                 return false;
             }

-            const size_t bpe = ggml_type_size(ggml_type(ttype));
+            const size_t bpe = wsp_ggml_type_size(wsp_ggml_type(ttype));

-            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+            if ((nelements*bpe)/wsp_ggml_blck_size(tensor->type) != wsp_ggml_nbytes(tensor)) {
                 log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                    __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
+                    __func__, name.data(), wsp_ggml_nbytes(tensor), nelements*bpe);
                 return false;
             }

-            loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
+            loader->read(loader->context, tensor->data, wsp_ggml_nbytes(tensor));
             BYTESWAP_TENSOR(tensor);

-            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
+            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], wsp_ggml_type_name((wsp_ggml_type) ttype), wsp_ggml_nbytes(tensor)/1024.0/1024.0);
+            total_size += wsp_ggml_nbytes(tensor);
             model.n_loaded++;
         }

@@ -1433,7 +1433,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
         }
     }

-    wctx.t_load_us = ggml_time_us() - t_start_us;
+    wctx.t_load_us = wsp_ggml_time_us() - t_start_us;

     return true;
 }
@@ -1454,7 +1454,7 @@ static bool whisper_encode_internal(
         const int mel_offset,
         const int n_threads){

-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();

     const auto & model = wctx.model;
     const auto & mel_inp = wstate.mel;
@@ -1468,21 +1468,21 @@ static bool whisper_encode_internal(
     const int n_mels = hparams.n_mels;
     assert(mel_inp.n_mel == n_mels);

-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
         /*.no_alloc =*/ false,
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);

     wstate.use_buf(ctx0, 0);

-    struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
-    assert(mel->type == GGML_TYPE_F32);
+    struct wsp_ggml_tensor * mel = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, 2*n_ctx, n_mels);
+    assert(mel->type == WSP_GGML_TYPE_F32);
     {
         float * dst = (float *) mel->data;
-        memset(dst, 0, ggml_nbytes(mel));
+        memset(dst, 0, wsp_ggml_nbytes(mel));

         const int i0 = std::min(mel_offset, mel_inp.n_len);
         const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
@@ -1494,7 +1494,7 @@ static bool whisper_encode_internal(
         }
     }

-    struct ggml_tensor * cur;
+    struct wsp_ggml_tensor * cur;

 #ifndef WHISPER_USE_COREML
     const bool use_coreml = false;
@@ -1513,25 +1513,25 @@ static bool whisper_encode_internal(
         {
             wstate.use_buf(ctx0, 1);

-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_1_w, mel, 1, 1);
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         model.e_conv_1_b,
                         cur),
                     cur);

-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);

             wstate.use_buf(ctx0, 0);

-            cur = ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            cur = wsp_ggml_conv_1d_ph(ctx0, model.e_conv_2_w, cur, 2, 1);
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         model.e_conv_2_b,
                         cur),
                     cur);

-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);
         }

         wstate.use_buf(ctx0, 3);
@@ -1544,25 +1544,25 @@ static bool whisper_encode_internal(
     //iter = (iter + 1) % n_iter;

     //if (iter == 0) {
-    //  memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
-    //  memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
+    //  memset(model.memory_cross_k->data, 0, wsp_ggml_nbytes(model.memory_cross_k));
+    //  memset(model.memory_cross_v->data, 0, wsp_ggml_nbytes(model.memory_cross_v));
     //}

     static int iter = 0;

-    const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
-    const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
+    const size_t e_pe_stride = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe);
+    const size_t e_pe_offset = model.e_pe->ne[0]*wsp_ggml_element_size(model.e_pe)*n_ctx*iter;

-    struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
+    struct wsp_ggml_tensor * e_pe = wsp_ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);

-    cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+    cur = wsp_ggml_add(ctx0, e_pe, wsp_ggml_transpose(ctx0, cur));

     // ===================================================================

     // original:
-    //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
+    //cur = wsp_ggml_add(ctx0, model.e_pe, wsp_ggml_transpose(ctx0, cur));

-    struct ggml_tensor * inpL = cur;
+    struct wsp_ggml_tensor * inpL = cur;

     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers_encoder[il];
@@ -1571,45 +1571,45 @@ static bool whisper_encode_internal(
        {
             wstate.use_buf(ctx0, 0);

-            cur = ggml_norm(ctx0, inpL);
+            cur = wsp_ggml_norm(ctx0, inpL);

             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
        }

         // self-attention
         {
             wstate.use_buf(ctx0, 1);

-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);

-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_q_b,
                         Qcur),
                     Qcur);

-            //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);

-            //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            //Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

-            struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_v_w,
                     cur);

-            Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Vcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_v_b,
                         Vcur),
                     Vcur);
@@ -1619,98 +1619,98 @@ static bool whisper_encode_internal(
             wstate.use_buf(ctx0, 0);

 #ifdef WHISPER_USE_FLASH_ATTN
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));

-            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_flash_attn(ctx0, Q, K, V, false);
 #else
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                        wsp_ggml_cpy(ctx0,
                             Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+                            wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                         0, 2, 1, 3);

             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);

-            struct ggml_tensor * KQ_scaled =
-                ggml_scale_inplace(ctx0,
+            struct wsp_ggml_tensor * KQ_scaled =
+                wsp_ggml_scale_inplace(ctx0,
                         KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+                        wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                         );

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_scaled);

-            struct ggml_tensor * V =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_cpy(ctx0,
+                        wsp_ggml_permute(ctx0,
+                            wsp_ggml_reshape_3d(ctx0,
                                 Vcur,
                                 n_state/n_head, n_head, n_ctx),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
+                        wsp_ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
                         );

-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);

             wstate.use_buf(ctx0, 1);

-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx));
         }

         // projection
         {
             wstate.use_buf(ctx0, 0);

-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);

             wstate.use_buf(ctx0, 1);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                     cur);
         }

         wstate.use_buf(ctx0, 2);

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = wsp_ggml_add(ctx0, cur, inpL);

-        struct ggml_tensor * inpFF = cur;
+        struct wsp_ggml_tensor * inpFF = cur;

         // feed-forward network
         {
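Both attention variants in the hunk above use the standard scaled dot-product factor 1/sqrt(d_head); the non-flash path applies it once to K*Q, while the decoder hunks further below apply pow(d_head, -0.25) to Q and K separately, which multiplies out to the same factor. A quick numeric check with assumed base-model dimensions (n_state = 512, n_head = 8):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float d_head = 512.0f / 8.0f;                 // 64
        const float once   = 1.0f / std::sqrt(d_head);      // applied once to K*Q
        const float split  = std::pow(d_head, -0.25f)
                           * std::pow(d_head, -0.25f);      // applied to Q and K
        std::printf("%.3f == %.3f\n", once, split);         // both print 0.125
    }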
@@ -1718,61 +1718,61 @@ static bool whisper_encode_internal(
|
|
|
1718
1718
|
{
|
|
1719
1719
|
wstate.use_buf(ctx0, 0);
|
|
1720
1720
|
|
|
1721
|
-
cur =
|
|
1721
|
+
cur = wsp_ggml_norm(ctx0, inpFF);
|
|
1722
1722
|
|
|
1723
1723
|
wstate.use_buf(ctx0, 1);
|
|
1724
1724
|
|
|
1725
1725
|
// cur = mlp_ln_w*cur + mlp_ln_b
|
|
1726
|
-
cur =
|
|
1727
|
-
|
|
1728
|
-
|
|
1726
|
+
cur = wsp_ggml_add(ctx0,
|
|
1727
|
+
wsp_ggml_mul(ctx0,
|
|
1728
|
+
wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
|
|
1729
1729
|
cur),
|
|
1730
|
-
|
|
1730
|
+
wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
|
1731
1731
|
}
|
|
1732
1732
|
|
|
1733
1733
|
#ifdef WHISPER_USE_FLASH_FF
|
|
1734
1734
|
wstate.use_buf(ctx0, 0);
|
|
1735
1735
|
|
|
1736
|
-
cur =
|
|
1737
|
-
|
|
1736
|
+
cur = wsp_ggml_flash_ff(ctx0,
|
|
1737
|
+
wsp_ggml_cpy(ctx0, cur, wsp_ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
|
|
1738
1738
|
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
|
|
1739
1739
|
#else
|
|
1740
1740
|
wstate.use_buf(ctx0, 0);
|
|
1741
1741
|
|
|
1742
1742
|
// fully connected
|
|
1743
|
-
cur =
|
|
1743
|
+
cur = wsp_ggml_mul_mat(ctx0,
|
|
1744
1744
|
layer.mlp_0_w,
|
|
1745
1745
|
cur);
|
|
1746
1746
|
|
|
1747
1747
|
wstate.use_buf(ctx0, 1);
|
|
1748
1748
|
|
|
1749
|
-
cur =
|
|
1750
|
-
|
|
1749
|
+
cur = wsp_ggml_add(ctx0,
|
|
1750
|
+
wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
|
|
1751
1751
|
cur);
|
|
1752
1752
|
|
|
1753
1753
|
wstate.use_buf(ctx0, 0);
|
|
1754
1754
|
|
|
1755
1755
|
// GELU activation
|
|
1756
|
-
cur =
|
|
1756
|
+
cur = wsp_ggml_gelu(ctx0, cur);
|
|
1757
1757
|
|
|
1758
1758
|
wstate.use_buf(ctx0, 1);
|
|
1759
1759
|
|
|
1760
1760
|
// projection
|
|
1761
|
-
cur =
|
|
1761
|
+
cur = wsp_ggml_mul_mat(ctx0,
|
|
1762
1762
|
layer.mlp_1_w,
|
|
1763
1763
|
cur);
|
|
1764
1764
|
|
|
1765
1765
|
wstate.use_buf(ctx0, 0);
|
|
1766
1766
|
|
|
1767
|
-
cur =
|
|
1768
|
-
|
|
1767
|
+
cur = wsp_ggml_add(ctx0,
|
|
1768
|
+
wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
|
|
1769
1769
|
cur);
|
|
1770
1770
|
#endif
|
|
1771
1771
|
}
|
|
1772
1772
|
|
|
1773
1773
|
wstate.use_buf(ctx0, 3);
|
|
1774
1774
|
|
|
1775
|
-
inpL =
|
|
1775
|
+
inpL = wsp_ggml_add(ctx0, cur, inpFF);
|
|
1776
1776
|
}
|
|
1777
1777
|
|
|
1778
1778
|
cur = inpL;
|
|
@@ -1781,36 +1781,36 @@ static bool whisper_encode_internal(
|
|
|
1781
1781
|
{
|
|
1782
1782
|
wstate.use_buf(ctx0, 0);
|
|
1783
1783
|
|
|
1784
|
-
cur =
|
|
1784
|
+
cur = wsp_ggml_norm(ctx0, cur);
|
|
1785
1785
|
|
|
1786
1786
|
wstate.use_buf(ctx0, 1);
|
|
1787
1787
|
|
|
1788
1788
|
// cur = ln_f_g*cur + ln_f_b
|
|
1789
|
-
cur =
|
|
1790
|
-
|
|
1791
|
-
|
|
1789
|
+
cur = wsp_ggml_add(ctx0,
|
|
1790
|
+
wsp_ggml_mul(ctx0,
|
|
1791
|
+
wsp_ggml_repeat(ctx0, model.e_ln_w, cur),
|
|
1792
1792
|
cur),
|
|
1793
|
-
|
|
1793
|
+
wsp_ggml_repeat(ctx0, model.e_ln_b, cur));
|
|
1794
1794
|
}
|
|
1795
1795
|
|
|
1796
1796
|
wstate.use_buf(ctx0, -1);
|
|
1797
1797
|
|
|
1798
1798
|
// run the computation
|
|
1799
1799
|
{
|
|
1800
|
-
struct
|
|
1800
|
+
struct wsp_ggml_cgraph gf = {};
|
|
1801
1801
|
gf.n_threads = n_threads;
|
|
1802
1802
|
|
|
1803
|
-
|
|
1804
|
-
|
|
1803
|
+
wsp_ggml_build_forward_expand(&gf, cur);
|
|
1804
|
+
wsp_ggml_graph_compute(ctx0, &gf);
|
|
1805
1805
|
|
|
1806
|
-
//
|
|
1806
|
+
//wsp_ggml_graph_print(&gf);
|
|
1807
1807
|
}
|
|
1808
1808
|
}
|
|
1809
1809
|
#ifdef WHISPER_USE_COREML
|
|
1810
1810
|
else if (use_coreml) {
|
|
1811
1811
|
wstate.use_buf(ctx0, -1);
|
|
1812
1812
|
|
|
1813
|
-
cur =
|
|
1813
|
+
cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
|
|
1814
1814
|
|
|
1815
1815
|
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
|
1816
1816
|
}
|
|
@@ -1819,7 +1819,7 @@ static bool whisper_encode_internal(
|
|
|
1819
1819
|
else if (use_openvino) {
|
|
1820
1820
|
wstate.use_buf(ctx0, -1);
|
|
1821
1821
|
|
|
1822
|
-
cur =
|
|
1822
|
+
cur = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, n_ctx);
|
|
1823
1823
|
|
|
1824
1824
|
if (!whisper_openvino_encode(wstate.ctx_openvino, mel, cur)) {
|
|
1825
1825
|
return false;
|
|
@@ -1843,11 +1843,11 @@ static bool whisper_encode_internal(
|
|
|
1843
1843
|
|
|
1844
1844
|
// pre-compute cross-attention memory
|
|
1845
1845
|
{
|
|
1846
|
-
struct
|
|
1846
|
+
struct wsp_ggml_cgraph gf = {};
|
|
1847
1847
|
gf.n_threads = n_threads;
|
|
1848
1848
|
|
|
1849
1849
|
// TODO: hack to disconnect the encoded features from the previous graph
|
|
1850
|
-
cur->op =
|
|
1850
|
+
cur->op = WSP_GGML_OP_NONE;
|
|
1851
1851
|
cur->src0 = nullptr;
|
|
1852
1852
|
cur->src1 = nullptr;
|
|
1853
1853
|
|
|
@@ -1856,53 +1856,53 @@ static bool whisper_encode_internal(
|
|
|
1856
1856
|
|
|
1857
1857
|
wstate.use_buf(ctx0, 0);
|
|
1858
1858
|
|
|
1859
|
-
struct
|
|
1859
|
+
struct wsp_ggml_tensor* Kcross = wsp_ggml_mul_mat(ctx0,
|
|
1860
1860
|
layer.cross_attn_k_w,
|
|
1861
1861
|
cur);
|
|
1862
1862
|
|
|
1863
|
-
Kcross =
|
|
1863
|
+
Kcross = wsp_ggml_scale_inplace(ctx0, Kcross, wsp_ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
|
|
1864
1864
|
|
|
1865
1865
|
wstate.use_buf(ctx0, 1);
|
|
1866
1866
|
|
|
1867
|
-
struct
|
|
1867
|
+
struct wsp_ggml_tensor* Vcross = wsp_ggml_mul_mat(ctx0,
|
|
1868
1868
|
layer.cross_attn_v_w,
|
|
1869
1869
|
cur);
|
|
1870
1870
|
|
|
1871
|
-
Vcross =
|
|
1872
|
-
|
|
1871
|
+
Vcross = wsp_ggml_add(ctx0,
|
|
1872
|
+
wsp_ggml_repeat(ctx0,
|
|
1873
1873
|
layer.cross_attn_v_b,
|
|
1874
1874
|
Vcross),
|
|
1875
1875
|
Vcross);
|
|
1876
1876
|
|
|
1877
1877
|
wstate.use_buf(ctx0, -1);
|
|
1878
1878
|
|
|
1879
|
-
Vcross =
|
|
1879
|
+
Vcross = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcross, n_state, n_ctx));
|
|
1880
1880
|
|
|
1881
|
-
struct
|
|
1882
|
-
struct
|
|
1883
|
-
( n_ctx)*
|
|
1884
|
-
(il*n_ctx)*
|
|
1881
|
+
struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (wsp_ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
|
|
1882
|
+
struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, wstate.kv_cross.v, n_ctx, n_state,
|
|
1883
|
+
( n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v),
|
|
1884
|
+
(il*n_ctx)*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
|
|
1885
1885
|
|
|
1886
|
-
|
|
1887
|
-
|
|
1886
|
+
wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcross, k));
|
|
1887
|
+
wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcross, v));
|
|
1888
1888
|
}
|
|
1889
1889
|
|
|
1890
|
-
|
|
1891
|
-
//
|
|
1890
|
+
wsp_ggml_graph_compute(ctx0, &gf);
|
|
1891
|
+
//wsp_ggml_graph_print(&gf);
|
|
1892
1892
|
}
|
|
1893
1893
|
|
|
1894
1894
|
////////////////////////////////////////////////////////////////////////////
|
|
1895
1895
|
|
|
1896
1896
|
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
|
|
1897
|
-
//
|
|
1897
|
+
// wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
|
|
1898
1898
|
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
|
|
1899
1899
|
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
|
|
1900
1900
|
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
|
|
1901
1901
|
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
|
|
1902
1902
|
|
|
1903
|
-
|
|
1903
|
+
wsp_ggml_free(ctx0);
|
|
1904
1904
|
|
|
1905
|
-
wstate.t_encode_us +=
|
|
1905
|
+
wstate.t_encode_us += wsp_ggml_time_us() - t_start_us;
|
|
1906
1906
|
wstate.n_encode++;
|
|
1907
1907
|
|
|
1908
1908
|
return true;
|
|
@@ -1926,7 +1926,7 @@ static bool whisper_decode_internal(
|
|
|
1926
1926
|
const int n_tokens,
|
|
1927
1927
|
const int n_past,
|
|
1928
1928
|
const int n_threads) {
|
|
1929
|
-
const int64_t t_start_us =
|
|
1929
|
+
const int64_t t_start_us = wsp_ggml_time_us();
|
|
1930
1930
|
|
|
1931
1931
|
const auto & model = wctx.model;
|
|
1932
1932
|
const auto & hparams = model.hparams;
|
|
@@ -1949,21 +1949,21 @@ static bool whisper_decode_internal(
 
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
 
-    struct ggml_init_params params = {
+    struct wsp_ggml_init_params params = {
         /*.mem_size   =*/ wstate.buf_compute.size(),
         /*.mem_buffer =*/ wstate.buf_compute.data(),
         /*.no_alloc   =*/ false,
     };
 
-    struct ggml_context * ctx0 = ggml_init(params);
+    struct wsp_ggml_context * ctx0 = wsp_ggml_init(params);
 
-    struct ggml_cgraph gf = {};
+    struct wsp_ggml_cgraph gf = {};
     gf.n_threads = n_threads;
 
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, tokens, N*ggml_element_size(embd));
+    struct wsp_ggml_tensor * embd = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
+    memcpy(embd->data, tokens, N*wsp_ggml_element_size(embd));
 
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct wsp_ggml_tensor * position = wsp_ggml_new_tensor_1d(ctx0, WSP_GGML_TYPE_I32, N);
     for (int i = 0; i < N; ++i) {
         ((int32_t *) position->data)[i] = n_past + i;
     }
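Note: the `position` tensor feeds the decoder absolute positions for the N incoming tokens, continuing from n_past. A minimal standalone sketch of that loop, with made-up values:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int n_past = 7, N = 3;                 // hypothetical decode step
        std::vector<int32_t> position(N);
        for (int i = 0; i < N; ++i) {
            position[i] = n_past + i;                // same as the loop above
        }
        for (int32_t p : position) printf("%d ", p); // prints: 7 8 9
        printf("\n");
        return 0;
    }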
@@ -1971,12 +1971,12 @@ static bool whisper_decode_internal(
     wstate.use_buf(ctx0, 3);
 
     // token encoding + position encoding
-    struct ggml_tensor * cur =
-        ggml_add(ctx0,
-            ggml_get_rows(ctx0, model.d_te, embd),
-            ggml_get_rows(ctx0, model.d_pe, position));
+    struct wsp_ggml_tensor * cur =
+        wsp_ggml_add(ctx0,
+            wsp_ggml_get_rows(ctx0, model.d_te, embd),
+            wsp_ggml_get_rows(ctx0, model.d_pe, position));
 
-    struct ggml_tensor * inpL = cur;
+    struct wsp_ggml_tensor * inpL = cur;
 
     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers_decoder[il];
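Note: the get_rows/add pair above is the usual transformer input — each row of `cur` is the token embedding plus the positional embedding. A toy sketch with invented two-column tables:

    #include <cstdio>
    #include <vector>

    int main() {
        // hypothetical tables: 4 tokens / 4 positions, width 2
        std::vector<std::vector<float>> d_te = {{0,1},{1,0},{2,2},{3,3}};
        std::vector<std::vector<float>> d_pe = {{.1f,.1f},{.2f,.2f},{.3f,.3f},{.4f,.4f}};
        const int token = 2, pos = 1;
        for (int c = 0; c < 2; ++c) {
            printf("%g ", d_te[token][c] + d_pe[pos][c]);  // one row of `cur`
        }
        printf("\n");
        return 0;
    }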
@@ -1985,236 +1985,236 @@ static bool whisper_decode_internal(
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_norm(ctx0, inpL);
+            cur = wsp_ggml_norm(ctx0, inpL);
 
             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
         }
 
         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_q_w,
                     cur);
 
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.attn_q_b,
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // note: no bias for Key
-            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Kcur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_k_w,
                     cur);
 
-            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = wsp_ggml_scale_inplace(ctx0, Kcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // store key and value to memory
             {
-                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+                struct wsp_ggml_tensor * Vcur = wsp_ggml_mul_mat(ctx0,
                         layer.attn_v_w,
                         cur);
 
-                Vcur = ggml_add(ctx0,
-                        ggml_repeat(ctx0,
+                Vcur = wsp_ggml_add(ctx0,
+                        wsp_ggml_repeat(ctx0,
                             layer.attn_v_b,
                             Vcur),
                         Vcur);
 
-                Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_state, N));
+                Vcur = wsp_ggml_transpose(ctx0, wsp_ggml_reshape_2d(ctx0, Vcur, n_state, N));
 
-                struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_state, (ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_state,
-                        ( n_ctx)*ggml_element_size(kv_self.v),
-                        (il*n_ctx)*ggml_element_size(kv_self.v)*n_state + n_past*ggml_element_size(kv_self.v));
+                struct wsp_ggml_tensor * k = wsp_ggml_view_1d(ctx0, kv_self.k, N*n_state, (wsp_ggml_element_size(kv_self.k)*n_state)*(il*n_ctx + n_past));
+                struct wsp_ggml_tensor * v = wsp_ggml_view_2d(ctx0, kv_self.v, N, n_state,
+                        ( n_ctx)*wsp_ggml_element_size(kv_self.v),
+                        (il*n_ctx)*wsp_ggml_element_size(kv_self.v)*n_state + n_past*wsp_ggml_element_size(kv_self.v));
 
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Kcur, k));
+                wsp_ggml_build_forward_expand(&gf, wsp_ggml_cpy(ctx0, Vcur, v));
             }
 
             // ------
 
             wstate.use_buf(ctx0, 0);
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_cpy(ctx0,
                         Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
                     0, 2, 1, 3);
 
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                    ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.k)*n_state),
+            struct wsp_ggml_tensor * K =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_reshape_3d(ctx0,
+                        wsp_ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_state, il*n_ctx*wsp_ggml_element_size(kv_self.k)*n_state),
                         n_state/n_head, n_head, n_past + N),
                     0, 2, 1, 3);
 
             wstate.use_buf(ctx0, 1);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //struct wsp_ggml_tensor * KQ_scaled =
+            //    wsp_ggml_scale_inplace(ctx0,
             //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+            //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
+            struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ_masked);
 
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, kv_self.v,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_view_3d(ctx0, kv_self.v,
                     n_past + N, n_state/n_head, n_head,
-                    n_ctx*ggml_element_size(kv_self.v),
-                    n_ctx*ggml_element_size(kv_self.v)*n_state/n_head,
-                    il*n_ctx*ggml_element_size(kv_self.v)*n_state);
+                    n_ctx*wsp_ggml_element_size(kv_self.v),
+                    n_ctx*wsp_ggml_element_size(kv_self.v)*n_state/n_head,
+                    il*n_ctx*wsp_ggml_element_size(kv_self.v)*n_state);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
         }
 
         // projection
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 2);
 
         // add the input
-        struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
+        struct wsp_ggml_tensor * inpCA = wsp_ggml_add(ctx0, cur, inpL);
 
         // norm
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
+            cur = wsp_ggml_norm(ctx0, inpCA); // note: we use inpCA here
 
             // cur = ln_0_w*cur + ln_0_b
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_mul(ctx0,
+                        wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_w, cur),
                         cur),
-                    ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
+                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_0_b, cur));
         }
 
         // cross-attention
         {
-            struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+            struct wsp_ggml_tensor * Qcur = wsp_ggml_mul_mat(ctx0,
                     layer.cross_attn_q_w,
                     cur);
 
-            Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
+            Qcur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0,
                         layer.cross_attn_q_b,
                         Qcur),
                     Qcur);
 
-            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = wsp_ggml_scale_inplace(ctx0, Qcur, wsp_ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
 
             // Kcross is already scaled
-            struct ggml_tensor * Kcross =
-                ggml_reshape_3d(ctx0,
-                    ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
+            struct wsp_ggml_tensor * Kcross =
+                wsp_ggml_reshape_3d(ctx0,
+                    wsp_ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.k)*n_state),
                     n_state/n_head, n_head, M);
 
-            //struct ggml_tensor * Vcross =
-            //    ggml_reshape_3d(ctx0,
-            //            ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
+            //struct wsp_ggml_tensor * Vcross =
+            //    wsp_ggml_reshape_3d(ctx0,
+            //            wsp_ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state),
             //            n_state/n_head, n_head, M);
 
-            //struct ggml_tensor * V_trans =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
+            //struct wsp_ggml_tensor * V_trans =
+            //    wsp_ggml_cpy(ctx0,
+            //            wsp_ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
+            //            wsp_ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
 
-            struct ggml_tensor * V =
-                ggml_view_3d(ctx0, wstate.kv_cross.v,
+            struct wsp_ggml_tensor * V =
+                wsp_ggml_view_3d(ctx0, wstate.kv_cross.v,
                     M, n_state/n_head, n_head,
-                    M*ggml_element_size(wstate.kv_cross.v),
-                    M*ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
-                    il*M*ggml_element_size(wstate.kv_cross.v)*n_state);
+                    M*wsp_ggml_element_size(wstate.kv_cross.v),
+                    M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state/n_head,
+                    il*M*wsp_ggml_element_size(wstate.kv_cross.v)*n_state);
 
             // ------
 
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                    ggml_cpy(ctx0,
+            struct wsp_ggml_tensor * Q =
+                wsp_ggml_permute(ctx0,
+                    wsp_ggml_cpy(ctx0,
                         Qcur,
-                        ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, N)),
+                        wsp_ggml_new_tensor_3d(ctx0, WSP_GGML_TYPE_F32, n_state/n_head, n_head, N)),
                     0, 2, 1, 3);
 
-            struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * K = wsp_ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
 
             // K * Q
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct wsp_ggml_tensor * KQ = wsp_ggml_mul_mat(ctx0, K, Q);
 
-            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale_inplace(ctx0,
+            //struct wsp_ggml_tensor * KQ_scaled =
+            //    wsp_ggml_scale_inplace(ctx0,
             //            KQ,
-            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+            //            wsp_ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
             //            );
 
             // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+            //struct wsp_ggml_tensor * KQ_masked = wsp_ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
 
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);
+            struct wsp_ggml_tensor * KQ_soft_max = wsp_ggml_soft_max_inplace(ctx0, KQ);
 
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+            struct wsp_ggml_tensor * KQV = wsp_ggml_mul_mat(ctx0, V, KQ_soft_max);
 
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct wsp_ggml_tensor * KQV_merged = wsp_ggml_permute(ctx0, KQV, 0, 2, 1, 3);
 
             // cur = KQV_merged.contiguous().view(n_state, N)
-            cur = ggml_cpy(ctx0,
+            cur = wsp_ggml_cpy(ctx0,
                     KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, N));
+                    wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, n_state, N));
         }
 
         // projection
         {
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.cross_attn_ln_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 2);
 
         // add the input
-        cur = ggml_add(ctx0, cur, inpCA);
+        cur = wsp_ggml_add(ctx0, cur, inpCA);
 
-        struct ggml_tensor * inpFF = cur;
+        struct wsp_ggml_tensor * inpFF = cur;
 
         // feed-forward network
         {
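Note: Qcur and Kcur above are each scaled by pow(d, -0.25) instead of scaling the product once — since the factor is applied to both operands, KQ ends up carrying the standard 1/sqrt(d) attention scaling, which is why the KQ_scaled step stays commented out. A quick numeric check (sizes are illustrative):

    #include <cmath>
    #include <cstdio>

    int main() {
        const float n_state = 512.0f, n_head = 8.0f;   // assumed dimensions
        const float d = n_state / n_head;              // per-head width
        const float both = std::pow(d, -0.25f) * std::pow(d, -0.25f);
        printf("%.6f == %.6f\n", both, 1.0f / std::sqrt(d));  // both print 0.125000
        return 0;
    }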
@@ -2222,53 +2222,53 @@ static bool whisper_decode_internal(
             {
                 wstate.use_buf(ctx0, 0);
 
-                cur = ggml_norm(ctx0, inpFF);
+                cur = wsp_ggml_norm(ctx0, inpFF);
 
                 wstate.use_buf(ctx0, 1);
 
                 // cur = mlp_ln_w*cur + mlp_ln_b
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+                cur = wsp_ggml_add(ctx0,
+                        wsp_ggml_mul(ctx0,
+                            wsp_ggml_repeat(ctx0, layer.mlp_ln_w, cur),
                             cur),
-                        ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+                        wsp_ggml_repeat(ctx0, layer.mlp_ln_b, cur));
             }
 
             wstate.use_buf(ctx0, 0);
 
             // fully connected
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_0_w,
                     cur);
 
             wstate.use_buf(ctx0, 1);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_0_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.mlp_0_b, cur),
                     cur);
 
             wstate.use_buf(ctx0, 0);
 
             // GELU activation
-            cur = ggml_gelu(ctx0, cur);
+            cur = wsp_ggml_gelu(ctx0, cur);
 
             wstate.use_buf(ctx0, 1);
 
             // projection
-            cur = ggml_mul_mat(ctx0,
+            cur = wsp_ggml_mul_mat(ctx0,
                     layer.mlp_1_w,
                     cur);
 
             wstate.use_buf(ctx0, 0);
 
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_1_b, cur),
+            cur = wsp_ggml_add(ctx0,
+                    wsp_ggml_repeat(ctx0, layer.mlp_1_b, cur),
                     cur);
         }
 
         wstate.use_buf(ctx0, 3);
 
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = wsp_ggml_add(ctx0, cur, inpFF);
     }
 
     cur = inpL;
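Note: each decoder layer's MLP above is LayerNorm, up-projection plus bias, GELU, down-projection plus bias, then a residual add. A scalar sketch of the activation path (the weights are made up; the GELU below is the tanh approximation whose constants ggml commonly uses):

    #include <cmath>
    #include <cstdio>

    // tanh-approximation GELU
    static float gelu(float x) {
        return 0.5f*x*(1.0f + std::tanh(0.7978845608f*(x + 0.044715f*x*x*x)));
    }

    int main() {
        const float x = 0.5f;                 // one activation value
        const float w0 = 1.2f, b0 = 0.1f;     // mlp_0 (up-projection)
        const float w1 = -0.7f, b1 = 0.05f;   // mlp_1 (down-projection)
        const float h = gelu(w0*x + b0);
        printf("%f\n", w1*h + b1);
        return 0;
    }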
@@ -2277,15 +2277,15 @@ static bool whisper_decode_internal(
     {
         wstate.use_buf(ctx0, 0);
 
-        cur = ggml_norm(ctx0, cur);
+        cur = wsp_ggml_norm(ctx0, cur);
 
         wstate.use_buf(ctx0, 1);
 
-        cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.d_ln_w, cur),
+        cur = wsp_ggml_add(ctx0,
+                wsp_ggml_mul(ctx0,
+                    wsp_ggml_repeat(ctx0, model.d_ln_w, cur),
                     cur),
-                ggml_repeat(ctx0, model.d_ln_b, cur));
+                wsp_ggml_repeat(ctx0, model.d_ln_b, cur));
     }
 
     wstate.use_buf(ctx0, 0);
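Note: the norm/mul/add chain above computes an affine layer norm — normalize each row to zero mean and unit variance, then scale by d_ln_w and shift by d_ln_b. A small standalone version with toy data (scalar scale/shift for brevity):

    #include <cmath>
    #include <cstdio>

    int main() {
        float x[4] = {1.f, 2.f, 3.f, 4.f};   // one row of activations
        const float w = 0.9f, b = 0.1f;      // per-channel scale/shift
        float mean = 0.f, var = 0.f;
        for (float v : x) mean += v / 4.f;
        for (float v : x) var  += (v - mean)*(v - mean) / 4.f;
        const float eps = 1e-5f;             // assumed epsilon
        for (float v : x) printf("%f ", w*((v - mean)/std::sqrt(var + eps)) + b);
        printf("\n");
        return 0;
    }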
@@ -2293,38 +2293,38 @@ static bool whisper_decode_internal(
     // compute logits only for the last token
     // comment this line to compute logits for all N tokens
     // might be useful in the future
-    cur = ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
+    cur = wsp_ggml_view_2d(ctx0, cur, cur->ne[0], 1, cur->nb[1], (cur->ne[1] - 1)*cur->nb[1]);
 
-    struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
+    struct wsp_ggml_tensor * logits = wsp_ggml_mul_mat(ctx0, model.d_te, cur);
 
     wstate.use_buf(ctx0, -1);
 
     // run the computation
     {
-        ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute       (ctx0, &gf);
+        wsp_ggml_build_forward_expand(&gf, logits);
+        wsp_ggml_graph_compute       (ctx0, &gf);
     }
 
     // extract logits for all N tokens
     //logits_out.resize(N*n_vocab);
-    //memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
+    //memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*N*n_vocab);
 
     // extract logits only for the last token
     logits_out.resize(n_vocab);
-    memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*n_vocab);
+    memcpy(logits_out.data(), wsp_ggml_get_data(logits), sizeof(float)*n_vocab);
 
     if (N > 1) {
         //printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
-        //        ggml_used_mem(ctx0)/1024.0/1024.0,
+        //        wsp_ggml_used_mem(ctx0)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(0)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(1)/1024.0/1024.0,
         //        wstate.get_buf_max_mem(2)/1024.0/1024.0,
        //        wstate.get_buf_max_mem(3)/1024.0/1024.0);
     }
 
-    ggml_free(ctx0);
+    wsp_ggml_free(ctx0);
 
-    wstate.t_decode_us += ggml_time_us() - t_start_us;
+    wstate.t_decode_us += wsp_ggml_time_us() - t_start_us;
     wstate.n_decode++;
 
     return true;
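Note: the view_2d call above selects only the final token's activations — for a 2-D tensor, row i begins at byte offset i*nb[1], so the last of ne[1] rows starts at (ne[1] - 1)*nb[1]. Sketch with plain arrays (sizes invented):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t n_state = 4, n_tokens = 3;     // hypothetical shape
        float data[12];                             // n_tokens rows of n_state floats
        for (size_t i = 0; i < 12; ++i) data[i] = (float) i;
        const size_t nb1 = n_state * sizeof(float); // bytes per row, like nb[1]
        const float * last = (const float *)((const char *) data + (n_tokens - 1)*nb1);
        for (size_t c = 0; c < n_state; ++c) printf("%g ", last[c]);  // 8 9 10 11
        printf("\n");
        return 0;
    }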
@@ -2502,7 +2502,7 @@ static bool log_mel_spectrogram(
     const whisper_filters & filters,
     const bool speed_up,
     whisper_mel & mel) {
-    const int64_t t_start_us = ggml_time_us();
+    const int64_t t_start_us = wsp_ggml_time_us();
 
     // Hanning window
     std::vector<float> hann;
@@ -2574,7 +2574,7 @@ static bool log_mel_spectrogram(
         mel.data[i] = (mel.data[i] + 4.0)/4.0;
     }
 
-    wstate.t_mel_us += ggml_time_us() - t_start_us;
+    wstate.t_mel_us += wsp_ggml_time_us() - t_start_us;
 
     //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
 
@@ -2705,7 +2705,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
     {
-        const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
+        const size_t memory_size = wsp_ggml_nbytes(state->decoders[0].kv_self.k) + wsp_ggml_nbytes(state->decoders[0].kv_self.v);
         log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
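Note: the size logged above is keys plus values across all text layers. For scale, the arithmetic looks like this with hypothetical "base"-like dimensions (the numbers are illustrative assumptions, not values measured from this package):

    #include <cstdio>

    int main() {
        const double n_layer = 6, n_ctx = 448, n_state = 512;  // assumed dims
        const double esize = 2.0;                              // F16 bytes/element
        const double bytes = 2.0 * n_layer * n_ctx * n_state * esize;  // K and V
        printf("kv self size = %7.2f MB\n", bytes / 1024.0 / 1024.0);
        return 0;
    }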
@@ -2716,7 +2716,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
     }
 
     {
-        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
+        const size_t memory_size = wsp_ggml_nbytes(state->kv_cross.k) + wsp_ggml_nbytes(state->kv_cross.v);
         log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }
 
@@ -2885,7 +2885,7 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
 }
 
 struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     whisper_context * ctx = new whisper_context;
 
@@ -2976,7 +2976,7 @@ void whisper_free_state(struct whisper_state * state)
 void whisper_free(struct whisper_context * ctx) {
     if (ctx) {
         if (ctx->model.ctx) {
-            ggml_free(ctx->model.ctx);
+            wsp_ggml_free(ctx->model.ctx);
         }
         if (ctx->model.buf) {
             delete ctx->model.buf;
@@ -3373,7 +3373,7 @@ whisper_token whisper_token_transcribe(struct whisper_context * ctx) {
 }
 
 void whisper_print_timings(struct whisper_context * ctx) {
-    const int64_t t_end_us = ggml_time_us();
+    const int64_t t_end_us = wsp_ggml_time_us();
 
     log("\n");
     log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
@@ -3420,18 +3420,18 @@ const char * whisper_print_system_info(void) {
     static std::string s;
 
     s = "";
-    s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-    s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-    s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-    s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
-    s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-    s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
-    s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
-    s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-    s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
-    s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+    s += "AVX = " + std::to_string(wsp_ggml_cpu_has_avx()) + " | ";
+    s += "AVX2 = " + std::to_string(wsp_ggml_cpu_has_avx2()) + " | ";
+    s += "AVX512 = " + std::to_string(wsp_ggml_cpu_has_avx512()) + " | ";
+    s += "FMA = " + std::to_string(wsp_ggml_cpu_has_fma()) + " | ";
+    s += "NEON = " + std::to_string(wsp_ggml_cpu_has_neon()) + " | ";
+    s += "ARM_FMA = " + std::to_string(wsp_ggml_cpu_has_arm_fma()) + " | ";
+    s += "F16C = " + std::to_string(wsp_ggml_cpu_has_f16c()) + " | ";
+    s += "FP16_VA = " + std::to_string(wsp_ggml_cpu_has_fp16_va()) + " | ";
+    s += "WASM_SIMD = " + std::to_string(wsp_ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = " + std::to_string(wsp_ggml_cpu_has_blas()) + " | ";
+    s += "SSE3 = " + std::to_string(wsp_ggml_cpu_has_sse3()) + " | ";
+    s += "VSX = " + std::to_string(wsp_ggml_cpu_has_vsx()) + " | ";
     s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
     s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
 
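Note: whisper_print_system_info() is part of the public whisper.h API, so callers can log which SIMD and back-end features the build detected. A minimal usage sketch, assuming the package's whisper.h is on the include path and the library is linked:

    #include <cstdio>
    #include "whisper.h"

    int main() {
        // prints e.g. "AVX = 1 | AVX2 = 1 | ... | COREML = 0 | OPENVINO = 0 | "
        printf("system info: %s\n", whisper_print_system_info());
        return 0;
    }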
@@ -4314,7 +4314,7 @@ int whisper_full_with_state(
     }
 
     {
-        const int64_t t_start_sample_us = ggml_time_us();
+        const int64_t t_start_sample_us = wsp_ggml_time_us();
 
         whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
 
@@ -4323,8 +4323,8 @@ int whisper_full_with_state(
         for (int j = 1; j < n_decoders_cur; ++j) {
             auto & decoder = state->decoders[j];
 
-            memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
-            memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
+            memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, wsp_ggml_nbytes(decoder.kv_self.k));
+            memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, wsp_ggml_nbytes(decoder.kv_self.v));
 
             decoder.kv_self.n += prompt.size();
 
@@ -4333,12 +4333,12 @@ int whisper_full_with_state(
             memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
         }
 
-        state->t_sample_us += ggml_time_us() - t_start_sample_us;
+        state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
     }
 }
 
 for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
-    const int64_t t_start_sample_us = ggml_time_us();
+    const int64_t t_start_sample_us = wsp_ggml_time_us();
 
     // store the KV caches of all decoders when doing beam-search
     if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
@@ -4350,8 +4350,8 @@ int whisper_full_with_state(
                 continue;
             }
 
-            kv_bufs[j].k.resize(ggml_nbytes(decoder.kv_self.k));
-            kv_bufs[j].v.resize(ggml_nbytes(decoder.kv_self.v));
+            kv_bufs[j].k.resize(wsp_ggml_nbytes(decoder.kv_self.k));
+            kv_bufs[j].v.resize(wsp_ggml_nbytes(decoder.kv_self.v));
 
             memcpy(kv_bufs[j].k.data(), decoder.kv_self.k->data, kv_bufs[j].k.size());
             memcpy(kv_bufs[j].v.data(), decoder.kv_self.v->data, kv_bufs[j].v.size());
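Note: during beam search the decoder caches are snapshotted into plain byte buffers sized from wsp_ggml_nbytes, and copied back when a candidate beam is adopted. A standalone sketch of the same save/restore pattern, with a stand-in buffer in place of the real cache tensor:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>
    #include <vector>

    int main() {
        std::vector<float> kv = {1.f, 2.f, 3.f};          // stands in for kv_self.k->data
        const size_t nbytes = kv.size()*sizeof(float);    // stands in for wsp_ggml_nbytes(...)
        std::vector<uint8_t> snapshot(nbytes);
        std::memcpy(snapshot.data(), kv.data(), nbytes);  // save
        std::memcpy(kv.data(), snapshot.data(), nbytes);  // restore
        printf("snapshot holds %zu bytes\n", snapshot.size());
        return 0;
    }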
@@ -4531,7 +4531,7 @@ int whisper_full_with_state(
             }
         }
 
-        state->t_sample_us += ggml_time_us() - t_start_sample_us;
+        state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
 
         // obtain logits for the next token
         for (int j = 0; j < n_decoders_cur; ++j) {
@@ -4552,13 +4552,13 @@ int whisper_full_with_state(
             }
 
             {
-                const int64_t t_start_sample_us = ggml_time_us();
+                const int64_t t_start_sample_us = wsp_ggml_time_us();
 
                 whisper_process_logits(*ctx, *state, params, decoder, t_cur);
 
                 ++decoder.kv_self.n;
 
-                state->t_sample_us += ggml_time_us() - t_start_sample_us;
+                state->t_sample_us += wsp_ggml_time_us() - t_start_sample_us;
             }
         }
     }
@@ -4980,7 +4980,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     s = "";
     char strbuf[256];
 
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     size_t n = 20;
     size_t arr = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations
@@ -5001,11 +5001,11 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         double sum = 0.0;
 
         for (size_t i = 0; i < n; i++) {
-            const int64_t t0 = ggml_time_us();
+            const int64_t t0 = wsp_ggml_time_us();
 
             memcpy(dst, src, size);
 
-            const int64_t t1 = ggml_time_us();
+            const int64_t t1 = wsp_ggml_time_us();
 
             tsum += (t1 - t0)*1e-6;
 
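Note: the wsp_ggml_time_us() calls follow one idiom throughout this file — stamp before the work, stamp after, accumulate the delta. Since the ggml timer is internal to the package, here is the same idiom sketched with std::chrono purely for illustration:

    #include <chrono>
    #include <cstdio>

    int main() {
        using clock = std::chrono::steady_clock;
        double tsum = 0.0;                    // accumulated seconds
        const auto t0 = clock::now();
        volatile double sink = 0.0;           // stand-in for the measured work
        for (int i = 0; i < 1000000; ++i) sink = sink + i;
        const auto t1 = clock::now();
        tsum += std::chrono::duration<double>(t1 - t0).count();
        printf("elapsed: %.6f s\n", tsum);
        return 0;
    }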
@@ -5030,17 +5030,17 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
     return s.c_str();
 }
 
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
-    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
+WHISPER_API int whisper_bench_wsp_ggml_mul_mat(int n_threads) {
+    fputs(whisper_bench_wsp_ggml_mul_mat_str(n_threads), stderr);
     return 0;
 }
 
-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+WHISPER_API const char * whisper_bench_wsp_ggml_mul_mat_str(int n_threads) {
     static std::string s;
     s = "";
     char strbuf[256];
 
-    ggml_time_init();
+    wsp_ggml_time_init();
 
     const int n_max = 128;
 
@@ -5080,45 +5080,45 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
             const size_t N = sizes[j];
 
             for (int k = 0; k < 7; ++k) {
-                const ggml_type wtype =
-                    k == 0 ? GGML_TYPE_Q4_0 :
-                    k == 1 ? GGML_TYPE_Q4_1 :
-                    k == 2 ? GGML_TYPE_Q5_0 :
-                    k == 3 ? GGML_TYPE_Q5_1 :
-                    k == 4 ? GGML_TYPE_Q8_0 :
-                    k == 5 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+                const wsp_ggml_type wtype =
+                    k == 0 ? WSP_GGML_TYPE_Q4_0 :
+                    k == 1 ? WSP_GGML_TYPE_Q4_1 :
+                    k == 2 ? WSP_GGML_TYPE_Q5_0 :
+                    k == 3 ? WSP_GGML_TYPE_Q5_1 :
+                    k == 4 ? WSP_GGML_TYPE_Q8_0 :
+                    k == 5 ? WSP_GGML_TYPE_F16 : WSP_GGML_TYPE_F32;
 
                 double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
                 int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;
 
-                struct ggml_init_params gparams = {
+                struct wsp_ggml_init_params gparams = {
                     /*.mem_size   =*/ buf.size(),
                     /*.mem_buffer =*/ buf.data(),
                     /*.no_alloc   =*/ false,
                 };
 
-                struct ggml_context * ctx0 = ggml_init(gparams);
+                struct wsp_ggml_context * ctx0 = wsp_ggml_init(gparams);
 
-                struct ggml_tensor * a = ggml_new_tensor_2d(ctx0, wtype, N, N);
-                struct ggml_tensor * b = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, N, N);
+                struct wsp_ggml_tensor * a = wsp_ggml_new_tensor_2d(ctx0, wtype, N, N);
+                struct wsp_ggml_tensor * b = wsp_ggml_new_tensor_2d(ctx0, WSP_GGML_TYPE_F32, N, N);
 
-                struct ggml_tensor * c = ggml_mul_mat(ctx0, a, b);
+                struct wsp_ggml_tensor * c = wsp_ggml_mul_mat(ctx0, a, b);
 
-                struct ggml_cgraph gf = ggml_build_forward(c);
+                struct wsp_ggml_cgraph gf = wsp_ggml_build_forward(c);
 
                 gf.n_threads = n_threads;
 
                 double tsum = 0.0;
 
                 // heat-up
-                ggml_graph_compute(ctx0, &gf);
+                wsp_ggml_graph_compute(ctx0, &gf);
 
                 for (int i = 0; i < n_max; ++i) {
-                    const int64_t t0 = ggml_time_us();
+                    const int64_t t0 = wsp_ggml_time_us();
 
-                    ggml_graph_compute(ctx0, &gf);
+                    wsp_ggml_graph_compute(ctx0, &gf);
 
-                    const int64_t t1 = ggml_time_us();
+                    const int64_t t1 = wsp_ggml_time_us();
 
                     tsum += (t1 - t0)*1e-6;
                     n++;
@@ -5128,7 +5128,7 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
                 }
             }
 
-            ggml_free(ctx0);
+            wsp_ggml_free(ctx0);
 
             s = ((2.0*N*N*N*n)/tsum)*1e-9;
         }