npm - whisper.rn - Versions diffs - 0.1.4 → 0.2.0 - Mend

whisper.rn 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (25) hide show

package/LICENSE +1 -1
package/README.md +43 -4
package/android/build.gradle +2 -4
package/android/src/main/java/com/rnwhisper/RNWhisperModule.java +47 -7
package/android/src/main/java/com/rnwhisper/WhisperContext.java +196 -7
package/android/src/main/jni/whisper/Whisper.mk +1 -1
package/android/src/main/jni/whisper/jni.cpp +33 -9
package/cpp/rn-whisper.cpp +26 -0
package/cpp/rn-whisper.h +5 -0
package/cpp/whisper.cpp +603 -412
package/cpp/whisper.h +120 -40
package/ios/RNWhisper.h +2 -2
package/ios/RNWhisper.mm +78 -111
package/ios/RNWhisperContext.h +53 -0
package/ios/RNWhisperContext.mm +303 -0
package/jest/mock.js +38 -2
package/lib/commonjs/index.js +63 -2
package/lib/commonjs/index.js.map +1 -1
package/lib/module/index.js +64 -3
package/lib/module/index.js.map +1 -1
package/lib/typescript/index.d.ts +61 -2
package/lib/typescript/index.d.ts.map +1 -1
package/package.json +2 -2
package/src/index.tsx +121 -4
package/whisper-rn.podspec +15 -8

package/cpp/whisper.cpp CHANGED Viewed

@@ -547,13 +547,11 @@ struct whisper_decoder {
     std::vector<whisper_token> tokens_tmp; // used for whisper_decode calls
 };
-struct whisper_context {
-    int64_t t_load_us   = 0;
-    int64_t t_mel_us    = 0;
+struct whisper_state {
     int64_t t_sample_us = 0;
     int64_t t_encode_us = 0;
     int64_t t_decode_us = 0;
-    int64_t t_start_us  = 0;
+    int64_t t_mel_us = 0;
     int32_t n_sample = 0; // number of tokens sampled
     int32_t n_encode = 0; // number of encoder calls
@@ -561,16 +559,10 @@ struct whisper_context {
     int32_t n_fail_p = 0; // number of logprob threshold failures
     int32_t n_fail_h = 0; // number of entropy threshold failures
-    ggml_type wtype; // weight type (FP32 or FP16)
-    whisper_mel mel;
-    whisper_model model;
-    whisper_vocab vocab;
     // cross-attention KV cache for the decoders
     // shared between all decoders
     whisper_kv_cache kv_cross;
+    whisper_mel mel;
     whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
@@ -635,6 +627,18 @@ struct whisper_context {
     }
 };
+struct whisper_context {
+    int64_t t_load_us = 0;
+    int64_t t_start_us = 0;
+    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
+    whisper_model model;
+    whisper_vocab vocab;
+    whisper_state * state = nullptr;
+};
 template<typename T>
 static void read_safe(whisper_model_loader * loader, T & dest) {
     loader->read(loader->context, &dest, sizeof(T));
@@ -821,32 +825,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         wctx.model.buf = new std::vector<uint8_t>();
         wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
-        if (!kv_cache_init(model.hparams, scale*MEM_REQ_KV_SELF.at(model.type), wctx.decoders[0].kv_self, wctx.wtype, model.hparams.n_text_ctx)) {
-            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
-            return false;
-        }
-        {
-            const size_t memory_size = ggml_nbytes(wctx.decoders[0].kv_self.k) + ggml_nbytes(wctx.decoders[0].kv_self.v);
-            fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
-        }
-        if (!kv_cache_init(model.hparams, scale*MEM_REQ_KV_CROSS.at(model.type), wctx.kv_cross, wctx.wtype, model.hparams.n_audio_ctx)) {
-            fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
-            return false;
-        }
-        {
-            const size_t memory_size = ggml_nbytes(wctx.kv_cross.k) + ggml_nbytes(wctx.kv_cross.v);
-            fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
-        }
-        wctx.buf_compute.resize(scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)));
-        wctx.buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(model.type));
-        wctx.buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(model.type));
-        wctx.buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(model.type));
-        wctx.buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(model.type));
+        // we skip initialization of the state until it is needed
+        // because it might be that state will always be provided externally.
     }
     // load mel filters
@@ -929,17 +909,6 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                 vocab.id_to_token[i] = word;
             }
         }
-        wctx.logits.reserve(vocab.n_vocab*model.hparams.n_text_ctx);
-        wctx.logits_id.reserve(n_vocab);
-        // TAGS: WHISPER_DECODER_INIT
-        wctx.decoders[0].sequence.tokens.reserve(model.hparams.n_text_ctx);
-        wctx.decoders[0].probs.reserve   (vocab.n_vocab);
-        wctx.decoders[0].logits.reserve  (vocab.n_vocab);
-        wctx.decoders[0].logprobs.reserve(vocab.n_vocab);
     }
     size_t ctx_size = 0;
@@ -1339,33 +1308,34 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         }
     }
-    wctx.rng = std::mt19937(0);
     wctx.t_load_us = ggml_time_us() - t_start_us;
     return true;
 }
-// evaluate the encoder
+// evaluate the encoder with the given state
 //
 // given audio recording (more specifically, its log mel spectrogram), runs forward pass of the encoder
 // part of the transformer model and returns the encoded features
 //
-//   - model:      the model
+//   - wctx:      the model
+//   - wstate:     the state of the encoder
 //   - n_threads:  number of threads to use
 //   - mel_offset: offset in the mel spectrogram (i.e. audio offset)
 //
-static bool whisper_encode(
+static bool whisper_encode_internal(
         whisper_context & wctx,
+          whisper_state & wstate,
               const int   mel_offset,
-              const int   n_threads) {
+              const int   n_threads){
     const int64_t t_start_us = ggml_time_us();
     const auto & model   = wctx.model;
-    const auto & mel_inp = wctx.mel;
+    const auto & mel_inp = wstate.mel;
     const auto & hparams = model.hparams;
-    const int n_ctx   = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
+    const int n_ctx   = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
     const int n_state = hparams.n_audio_state;
     const int n_head  = hparams.n_audio_head;
     const int n_layer = hparams.n_audio_layer;
@@ -1374,12 +1344,12 @@ static bool whisper_encode(
     assert(mel_inp.n_mel == n_mels);
     struct ggml_init_params params;
-    params.mem_size   = wctx.buf_compute.size();
-    params.mem_buffer = wctx.buf_compute.data();
+    params.mem_size   = wstate.buf_compute.size();
+    params.mem_buffer = wstate.buf_compute.data();
     struct ggml_context * ctx0 = ggml_init(params);
-    wctx.use_buf(ctx0, 0);
+    wstate.use_buf(ctx0, 0);
     struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
     assert(mel->type == GGML_TYPE_F32);
@@ -1401,30 +1371,30 @@ static bool whisper_encode(
     // convolution + gelu
     {
-        wctx.use_buf(ctx0, 1);
+        wstate.use_buf(ctx0, 1);
         cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
         cur = ggml_add(ctx0,
-                ggml_repeat(ctx0,
-                    model.e_conv_1_b,
-                    cur),
-                cur);
+            ggml_repeat(ctx0,
+                model.e_conv_1_b,
+                cur),
+            cur);
         cur = ggml_gelu(ctx0, cur);
-        wctx.use_buf(ctx0, 0);
+        wstate.use_buf(ctx0, 0);
         cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
         cur = ggml_add(ctx0,
-                ggml_repeat(ctx0,
-                    model.e_conv_2_b,
-                    cur),
-                cur);
+            ggml_repeat(ctx0,
+                model.e_conv_2_b,
+                cur),
+            cur);
         cur = ggml_gelu(ctx0, cur);
     }
-    wctx.use_buf(ctx0, 3);
+    wstate.use_buf(ctx0, 3);
     // ===================================================================
     // NOTE: experimenting with partial evaluation of the encoder (ignore)
@@ -1439,7 +1409,7 @@ static bool whisper_encode(
     //}
     static int iter = 0;
     const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
     const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
@@ -1459,54 +1429,54 @@ static bool whisper_encode(
         // norm
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_norm(ctx0, inpL);
             // cur = ln_0_w*cur + ln_0_b
             cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
-                        cur),
-                    ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+                    cur),
+                ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
         }
         // self-attention
         {
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
-                    layer.attn_q_w,
-                    cur);
+                layer.attn_q_w,
+                cur);
             Qcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_q_b,
-                        Qcur),
-                    Qcur);
+                ggml_repeat(ctx0,
+                    layer.attn_q_b,
+                    Qcur),
+                Qcur);
             //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
             // note: no bias for Key
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
-                    layer.attn_k_w,
-                    cur);
+                layer.attn_k_w,
+                cur);
             //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
-                    layer.attn_v_w,
-                    cur);
+                layer.attn_v_w,
+                cur);
             Vcur = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.attn_v_b,
-                        Vcur),
-                    Vcur);
+                ggml_repeat(ctx0,
+                    layer.attn_v_b,
+                    Vcur),
+                Vcur);
             // ------
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
 #ifdef WHISPER_USE_FLASH_ATTN
             struct ggml_tensor * Q =
@@ -1583,29 +1553,29 @@ static bool whisper_encode(
 #endif
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+                KQV_merged,
+                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
         }
         // projection
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_mul_mat(ctx0,
-                    layer.attn_ln_1_w,
-                    cur);
+                layer.attn_ln_1_w,
+                cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
-                    cur);
+                ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+                cur);
         }
-        wctx.use_buf(ctx0, 2);
+        wstate.use_buf(ctx0, 2);
         // add the input
         cur = ggml_add(ctx0, cur, inpL);
@@ -1616,61 +1586,61 @@ static bool whisper_encode(
         {
             // norm
             {
-                wctx.use_buf(ctx0, 0);
+                wstate.use_buf(ctx0, 0);
                 cur = ggml_norm(ctx0, inpFF);
-                wctx.use_buf(ctx0, 1);
+                wstate.use_buf(ctx0, 1);
                 // cur = mlp_ln_w*cur + mlp_ln_b
                 cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, layer.mlp_ln_w, cur),
-                            cur),
-                        ggml_repeat(ctx0, layer.mlp_ln_b, cur));
-            }
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+                        cur),
+                    ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+    }
 #ifdef WHISPER_USE_FLASH_FF
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_flash_ff(ctx0,
-                    ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wctx.wtype, n_state, n_ctx)),
-                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
+                ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+                layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
 #else
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             // fully connected
             cur = ggml_mul_mat(ctx0,
-                    layer.mlp_0_w,
-                    cur);
+                layer.mlp_0_w,
+                cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_0_b, cur),
-                    cur);
+                ggml_repeat(ctx0, layer.mlp_0_b, cur),
+                cur);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             // GELU activation
             cur = ggml_gelu(ctx0, cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             // projection
             cur = ggml_mul_mat(ctx0,
-                    layer.mlp_1_w,
-                    cur);
+                layer.mlp_1_w,
+                cur);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, layer.mlp_1_b, cur),
-                    cur);
+                ggml_repeat(ctx0, layer.mlp_1_b, cur),
+                cur);
 #endif
-        }
+}
-        wctx.use_buf(ctx0, 3);
+        wstate.use_buf(ctx0, 3);
         inpL = ggml_add(ctx0, cur, inpFF);
     }
@@ -1679,21 +1649,21 @@ static bool whisper_encode(
     // norm
     {
-        wctx.use_buf(ctx0, 0);
+        wstate.use_buf(ctx0, 0);
         cur = ggml_norm(ctx0, cur);
-        wctx.use_buf(ctx0, 1);
+        wstate.use_buf(ctx0, 1);
         // cur = ln_f_g*cur + ln_f_b
         cur = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.e_ln_w, cur),
-                    cur),
-                ggml_repeat(ctx0, model.e_ln_b, cur));
+            ggml_mul(ctx0,
+                ggml_repeat(ctx0, model.e_ln_w, cur),
+                cur),
+            ggml_repeat(ctx0, model.e_ln_b, cur));
     }
-    wctx.use_buf(ctx0, -1);
+    wstate.use_buf(ctx0, -1);
     // run the computation
     {
@@ -1701,7 +1671,7 @@ static bool whisper_encode(
         gf.n_threads = n_threads;
         ggml_build_forward_expand(&gf, cur);
-        ggml_graph_compute       (ctx0, &gf);
+        ggml_graph_compute(ctx0, &gf);
         //ggml_graph_print(&gf);
     }
@@ -1731,34 +1701,34 @@ static bool whisper_encode(
         cur->src1 = nullptr;
         for (int il = 0; il < model.hparams.n_text_layer; ++il) {
-            auto & layer = model.layers_decoder[il];
+            auto& layer = model.layers_decoder[il];
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
-            struct ggml_tensor * Kcross = ggml_mul_mat(ctx0,
-                    layer.cross_attn_k_w,
-                    cur);
+            struct ggml_tensor* Kcross = ggml_mul_mat(ctx0,
+                layer.cross_attn_k_w,
+                cur);
-            Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
-            struct ggml_tensor * Vcross = ggml_mul_mat(ctx0,
-                    layer.cross_attn_v_w,
-                    cur);
+            struct ggml_tensor* Vcross = ggml_mul_mat(ctx0,
+                layer.cross_attn_v_w,
+                cur);
             Vcross = ggml_add(ctx0,
-                    ggml_repeat(ctx0,
-                        layer.cross_attn_v_b,
-                        Vcross),
-                    Vcross);
+                ggml_repeat(ctx0,
+                    layer.cross_attn_v_b,
+                    Vcross),
+                Vcross);
-            wctx.use_buf(ctx0, -1);
+            wstate.use_buf(ctx0, -1);
-            //struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-            //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_cross.v, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
-            struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_cross.k, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.k)*n_state)*(il*n_ctx));
-            struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_cross.v, n_state*n_ctx, (ggml_element_size(wctx.kv_cross.v)*n_state)*(il*n_ctx));
+            //struct ggml_tensor * k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
+            //struct ggml_tensor * v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*hparams.n_audio_ctx + iter*n_ctx));
+            struct ggml_tensor* k = ggml_view_1d(ctx0, wstate.kv_cross.k, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.k)*n_state)*(il*n_ctx));
+            struct ggml_tensor* v = ggml_view_1d(ctx0, wstate.kv_cross.v, n_state*n_ctx, (ggml_element_size(wstate.kv_cross.v)*n_state)*(il*n_ctx));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcross, k));
             ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcross, v));
@@ -1779,8 +1749,8 @@ static bool whisper_encode(
     ggml_free(ctx0);
-    wctx.t_encode_us += ggml_time_us() - t_start_us;
-    wctx.n_encode++;
+    wstate.t_encode_us += ggml_time_us() - t_start_us;
+    wstate.n_encode++;
     return true;
 }
@@ -1795,8 +1765,9 @@ static bool whisper_encode(
 //   - n_tokens:   number of tokens in the prompt
 //   - n_past:     number of past tokens to prefix the prompt with
 //
-static bool whisper_decode(
+static bool whisper_decode_internal(
         whisper_context & wctx,
+          whisper_state & wstate,
         whisper_decoder & decoder,
     const whisper_token * tokens,
               const int   n_tokens,
@@ -1811,7 +1782,7 @@ static bool whisper_decode(
     WHISPER_ASSERT(!!kv_self.ctx);
-    auto & logits_out = wctx.logits;
+    auto & logits_out = wstate.logits;
     const int n_vocab = hparams.n_vocab;
@@ -1821,13 +1792,13 @@ static bool whisper_decode(
     const int n_layer = hparams.n_text_layer;
     const int N = n_tokens;
-    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;
+    const int M = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
     //WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
     struct ggml_init_params params;
-    params.mem_size   = wctx.buf_compute.size();
-    params.mem_buffer = wctx.buf_compute.data();
+    params.mem_size   = wstate.buf_compute.size();
+    params.mem_buffer = wstate.buf_compute.data();
     struct ggml_context * ctx0 = ggml_init(params);
@@ -1842,7 +1813,7 @@ static bool whisper_decode(
         ((int32_t *) position->data)[i] = n_past + i;
     }
-    wctx.use_buf(ctx0, 3);
+    wstate.use_buf(ctx0, 3);
     // token encoding + position encoding
     struct ggml_tensor * cur =
@@ -1857,7 +1828,7 @@ static bool whisper_decode(
         // norm
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_norm(ctx0, inpL);
@@ -1871,7 +1842,7 @@ static bool whisper_decode(
         // self-attention
         {
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                     layer.attn_q_w,
@@ -1913,7 +1884,7 @@ static bool whisper_decode(
             // ------
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
@@ -1929,12 +1900,12 @@ static bool whisper_decode(
                             n_state/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             //struct ggml_tensor * KQ_scaled =
             //    ggml_scale(ctx0,
@@ -1944,11 +1915,11 @@ static bool whisper_decode(
             struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             struct ggml_tensor * V_trans =
                 ggml_permute(ctx0,
@@ -1957,7 +1928,7 @@ static bool whisper_decode(
                             n_state/n_head, n_head, n_past + N),
                         1, 2, 0, 3);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@@ -1970,31 +1941,31 @@ static bool whisper_decode(
         // projection
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_mul_mat(ctx0,
                     layer.attn_ln_1_w,
                     cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_add(ctx0,
                     ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
                     cur);
         }
-        wctx.use_buf(ctx0, 2);
+        wstate.use_buf(ctx0, 2);
         // add the input
         struct ggml_tensor * inpCA = ggml_add(ctx0, cur, inpL);
         // norm
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             // cur = ln_0_w*cur + ln_0_b
             cur = ggml_add(ctx0,
@@ -2006,7 +1977,7 @@ static bool whisper_decode(
         // cross-attention
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
                     layer.cross_attn_q_w,
@@ -2023,19 +1994,19 @@ static bool whisper_decode(
             // Kcross is already scaled
             struct ggml_tensor * Kcross =
                 ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wctx.kv_cross.k, M*n_state, il*M*ggml_element_size(wctx.kv_cross.k)*n_state),
+                        ggml_view_1d(ctx0, wstate.kv_cross.k, M*n_state, il*M*ggml_element_size(wstate.kv_cross.k)*n_state),
                         n_state/n_head, n_head, M);
             struct ggml_tensor * Vcross =
                 ggml_reshape_3d(ctx0,
-                        ggml_view_1d(ctx0, wctx.kv_cross.v, M*n_state, il*M*ggml_element_size(wctx.kv_cross.v)*n_state),
+                        ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
                         n_state/n_head, n_head, M);
             struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);
             // ------
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * Q =
                 ggml_permute(ctx0,
@@ -2046,7 +2017,7 @@ static bool whisper_decode(
             struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             // K * Q
             struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
@@ -2060,15 +2031,15 @@ static bool whisper_decode(
             // no masking for cross-attention
             //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
@@ -2080,20 +2051,20 @@ static bool whisper_decode(
         // projection
         {
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_mul_mat(ctx0,
                     layer.cross_attn_ln_1_w,
                     cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_add(ctx0,
                     ggml_repeat(ctx0, layer.cross_attn_ln_1_b, cur),
                     cur);
         }
-        wctx.use_buf(ctx0, 2);
+        wstate.use_buf(ctx0, 2);
         // add the input
         cur = ggml_add(ctx0, cur, inpCA);
@@ -2104,11 +2075,11 @@ static bool whisper_decode(
         {
             // norm
             {
-                wctx.use_buf(ctx0, 0);
+                wstate.use_buf(ctx0, 0);
                 cur = ggml_norm(ctx0, inpFF);
-                wctx.use_buf(ctx0, 1);
+                wstate.use_buf(ctx0, 1);
                 // cur = mlp_ln_w*cur + mlp_ln_b
                 cur = ggml_add(ctx0,
@@ -2118,39 +2089,39 @@ static bool whisper_decode(
                         ggml_repeat(ctx0, layer.mlp_ln_b, cur));
             }
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             // fully connected
             cur = ggml_mul_mat(ctx0,
                     layer.mlp_0_w,
                     cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             cur = ggml_add(ctx0,
                     ggml_repeat(ctx0, layer.mlp_0_b, cur),
                     cur);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             // GELU activation
             cur = ggml_gelu(ctx0, cur);
-            wctx.use_buf(ctx0, 1);
+            wstate.use_buf(ctx0, 1);
             // projection
             cur = ggml_mul_mat(ctx0,
                     layer.mlp_1_w,
                     cur);
-            wctx.use_buf(ctx0, 0);
+            wstate.use_buf(ctx0, 0);
             cur = ggml_add(ctx0,
                     ggml_repeat(ctx0, layer.mlp_1_b, cur),
                     cur);
         }
-        wctx.use_buf(ctx0, 3);
+        wstate.use_buf(ctx0, 3);
         inpL = ggml_add(ctx0, cur, inpFF);
     }
@@ -2159,11 +2130,11 @@ static bool whisper_decode(
     // norm
     {
-        wctx.use_buf(ctx0, 0);
+        wstate.use_buf(ctx0, 0);
         cur = ggml_norm(ctx0, cur);
-        wctx.use_buf(ctx0, 1);
+        wstate.use_buf(ctx0, 1);
         cur = ggml_add(ctx0,
                 ggml_mul(ctx0,
@@ -2172,7 +2143,7 @@ static bool whisper_decode(
                 ggml_repeat(ctx0, model.d_ln_b, cur));
     }
-    wctx.use_buf(ctx0, 0);
+    wstate.use_buf(ctx0, 0);
     // compute logits only for the last token
     // comment this line to compute logits for all N tokens
@@ -2181,7 +2152,7 @@ static bool whisper_decode(
     struct ggml_tensor * logits = ggml_mul_mat(ctx0, model.d_te, cur);
-    wctx.use_buf(ctx0, -1);
+    wstate.use_buf(ctx0, -1);
     // run the computation
     {
@@ -2208,8 +2179,8 @@ static bool whisper_decode(
     ggml_free(ctx0);
-    wctx.t_decode_us += ggml_time_us() - t_start_us;
-    wctx.n_decode++;
+    wstate.t_decode_us += ggml_time_us() - t_start_us;
+    wstate.n_decode++;
     return true;
 }
@@ -2313,7 +2284,7 @@ static void fft(const std::vector<float> & in, std::vector<float> & out) {
 // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
 static bool log_mel_spectrogram(
-        whisper_context & wctx,
+          whisper_state & wstate,
             const float * samples,
               const int   n_samples,
               const int   /*sample_rate*/,
@@ -2433,7 +2404,7 @@ static bool log_mel_spectrogram(
         mel.data[i] = (mel.data[i] + 4.0)/4.0;
     }
-    wctx.t_mel_us += ggml_time_us() - t_start_us;
+    wstate.t_mel_us += ggml_time_us() - t_start_us;
     return true;
 }
@@ -2507,7 +2478,56 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //
-struct whisper_context * whisper_init_from_file(const char * path_model) {
+struct whisper_state * whisper_init_state(whisper_context * ctx) {
+    whisper_state * state = new whisper_state;
+    const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
+    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
+        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+        return nullptr;
+    }
+    {
+        const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
+        fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+    }
+    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
+        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+        return nullptr;
+    }
+    {
+        const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
+        fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+    }
+    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
+    state->logits_id.reserve(ctx->model.hparams.n_vocab);
+    // TAGS: WHISPER_DECODER_INIT
+    state->decoders[0].sequence.tokens.reserve(ctx->model.hparams.n_text_ctx);
+    state->decoders[0].probs.reserve(ctx->vocab.n_vocab);
+    state->decoders[0].logits.reserve(ctx->vocab.n_vocab);
+    state->decoders[0].logprobs.reserve(ctx->vocab.n_vocab);
+    state->buf_compute.resize(scale * std::max(MEM_REQ_ENCODE.at(ctx->model.type), MEM_REQ_DECODE.at(ctx->model.type)));
+    state->buf_scratch[0].resize(MEM_REQ_SCRATCH0.at(ctx->model.type));
+    state->buf_scratch[1].resize(MEM_REQ_SCRATCH1.at(ctx->model.type));
+    state->buf_scratch[2].resize(MEM_REQ_SCRATCH2.at(ctx->model.type));
+    state->buf_scratch[3].resize(MEM_REQ_SCRATCH3.at(ctx->model.type));
+    state->rng = std::mt19937(0);
+    return state;
+}
+struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
     whisper_model_loader loader = {};
     fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@@ -2535,10 +2555,10 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
         fin->close();
     };
-    return whisper_init(&loader);
+    return whisper_init_no_state(&loader);
 }
-struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
+struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
     struct buf_context {
         uint8_t* buffer;
         size_t size;
@@ -2571,10 +2591,10 @@ struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_s
     loader.close = [](void * /*ctx*/) { };
-    return whisper_init(&loader);
+    return whisper_init_no_state(&loader);
 }
-struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
+struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader) {
     ggml_time_init();
     whisper_context * ctx = new whisper_context;
@@ -2591,6 +2611,64 @@ struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
     return ctx;
 }
+struct whisper_context * whisper_init_from_file(const char * path_model) {
+    whisper_context * ctx = whisper_init_from_file_no_state(path_model);
+    if (!ctx) {
+        return nullptr;
+    }
+    ctx->state = whisper_init_state(ctx);
+    if (!ctx->state) {
+        whisper_free(ctx);
+        return nullptr;
+    }
+    return ctx;
+}
+struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
+    whisper_context * ctx = whisper_init_from_buffer_no_state(buffer, buffer_size);
+    if (!ctx) {
+        return nullptr;
+    }
+    ctx->state = whisper_init_state(ctx);
+    if (!ctx->state) {
+        whisper_free(ctx);
+        return nullptr;
+    }
+    return ctx;
+}
+struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
+    whisper_context * ctx = whisper_init_no_state(loader);
+    if (!ctx) {
+        return nullptr;
+    }
+    ctx->state = whisper_init_state(ctx);
+    if (!ctx->state) {
+        whisper_free(ctx);
+        return nullptr;
+    }
+    return ctx;
+}
+void whisper_free_state(struct whisper_state * state)
+{
+    if (state) {
+        kv_cache_free(state->kv_cross);
+        for (int i = 0; i < WHISPER_MAX_DECODERS; ++i) {
+            kv_cache_free(state->decoders[i].kv_self);
+        }
+        delete state;
+    }
+}
 void whisper_free(struct whisper_context * ctx) {
     if (ctx) {
         if (ctx->model.ctx) {
@@ -2599,20 +2677,29 @@ void whisper_free(struct whisper_context * ctx) {
         if (ctx->model.buf) {
             delete ctx->model.buf;
         }
-        if (ctx->kv_cross.ctx) {
-            ggml_free(ctx->kv_cross.ctx);
-        }
-        for (int i = 0; i < WHISPER_MAX_DECODERS; ++i) {
-            if (ctx->decoders[i].kv_self.ctx) {
-                ggml_free(ctx->decoders[i].kv_self.ctx);
-            }
-        }
+        whisper_free_state(ctx->state);
         delete ctx;
     }
 }
+int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+        return -1;
+    }
+    return 0;
+}
 int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*ctx, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, ctx->mel)) {
+    return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
+}
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
         fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
         return -1;
     }
@@ -2622,11 +2709,26 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
 // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
 int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*ctx, samples, n_samples, WHISPER_SAMPLE_RATE, 2*WHISPER_N_FFT, 2*WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, ctx->mel)) {
-        fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
+}
+int whisper_set_mel_with_state(
+        struct whisper_context * /*ctx*/,
+          struct whisper_state * state,
+                   const float * data,
+                           int   n_len,
+                           int   n_mel) {
+    if (n_mel != WHISPER_N_MEL) {
+        fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
         return -1;
     }
+    state->mel.n_len = n_len;
+    state->mel.n_mel = n_mel;
+    state->mel.data.resize(n_len*n_mel);
+    memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
     return 0;
 }
@@ -2635,22 +2737,20 @@ int whisper_set_mel(
         const float * data,
         int n_len,
         int n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    return whisper_set_mel_with_state(ctx, ctx->state, data, n_len, n_mel);
+}
+int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
+    if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
         return -1;
     }
-    ctx->mel.n_len = n_len;
-    ctx->mel.n_mel = n_mel;
-    ctx->mel.data.resize(n_len*n_mel);
-    memcpy(ctx->mel.data.data(), data, n_len*n_mel*sizeof(float));
     return 0;
 }
 int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
-    if (!whisper_encode(*ctx, offset, n_threads)) {
+    if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return -1;
     }
@@ -2658,11 +2758,28 @@ int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
     return 0;
 }
+int whisper_decode_with_state(struct whisper_context * ctx, struct whisper_state * state, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
+    const int selected_decoder_id = 0;
+    if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
+        fprintf(stderr, "%s: failed to eval\n", __func__);
+        return 1;
+    }
+    return 0;
+}
 int whisper_decode(struct whisper_context * ctx, const whisper_token * tokens, int n_tokens, int n_past, int n_threads) {
-    // TODO: add selected_decoder_id to context
+    // TODO: add selected_decoder_id to state
     const int selected_decoder_id = 0;
-    if (!whisper_decode(*ctx, ctx->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
+    if (ctx->state == nullptr) {
+        fprintf(stderr, "%s: ERROR state was not loaded.\n", __func__);
+        return false;
+    }
+    if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
         fprintf(stderr, "%s: failed to eval\n", __func__);
         return 1;
     }
@@ -2720,11 +2837,12 @@ const char * whisper_lang_str(int id) {
     return nullptr;
 }
-int whisper_lang_auto_detect(
+int whisper_lang_auto_detect_with_state(
         struct whisper_context * ctx,
-        int offset_ms,
-        int n_threads,
-        float * lang_probs) {
+          struct whisper_state * state,
+                           int   offset_ms,
+                           int   n_threads,
+                         float * lang_probs) {
     const int seek = offset_ms/10;
     if (seek < 0) {
@@ -2732,8 +2850,8 @@ int whisper_lang_auto_detect(
         return -1;
     }
-    if (seek >= ctx->mel.n_len) {
-        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, ctx->mel.n_len*10);
+    if (seek >= state->mel.n_len) {
+        fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10);
         return -2;
     }
@@ -2745,17 +2863,17 @@ int whisper_lang_auto_detect(
     const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
-    if (whisper_decode(ctx, prompt.data(), prompt.size(), 0, n_threads) != 0) {
+    if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
         fprintf(stderr, "%s: failed to decode\n", __func__);
         return -7;
     }
-    auto & logits_id = ctx->logits_id;
+    auto & logits_id = state->logits_id;
     logits_id.clear();
     for (const auto & kv : g_lang) {
         const auto token_lang = whisper_token_lang(ctx, kv.second.first);
-        logits_id.emplace_back(ctx->logits[token_lang], kv.second.first);
+        logits_id.emplace_back(state->logits[token_lang], kv.second.first);
     }
     // sort descending
@@ -2794,8 +2912,20 @@ int whisper_lang_auto_detect(
     return logits_id[0].second;
 }
+int whisper_lang_auto_detect(
+        struct whisper_context * ctx,
+                           int   offset_ms,
+                           int   n_threads,
+                         float * lang_probs) {
+    return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
+}
+int whisper_n_len_from_state(struct whisper_state * state) {
+    return state->mel.n_len;
+}
 int whisper_n_len(struct whisper_context * ctx) {
-    return ctx->mel.n_len;
+    return ctx->state->mel.n_len;
 }
 int whisper_n_vocab(struct whisper_context * ctx) {
@@ -2815,7 +2945,12 @@ int whisper_is_multilingual(struct whisper_context * ctx) {
 }
 float * whisper_get_logits(struct whisper_context * ctx) {
-    return ctx->logits.data();
+    return ctx->state->logits.data();
+}
+float * whisper_get_logits_from_state(struct whisper_state * state) {
+    return state->logits.data();
 }
 const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token) {
@@ -2861,24 +2996,29 @@ whisper_token whisper_token_transcribe(void) {
 void whisper_print_timings(struct whisper_context * ctx) {
     const int64_t t_end_us = ggml_time_us();
-    const int32_t n_sample = std::max(1, ctx->n_sample);
-    const int32_t n_encode = std::max(1, ctx->n_encode);
-    const int32_t n_decode = std::max(1, ctx->n_decode);
     fprintf(stderr, "\n");
-    fprintf(stderr, "%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->n_fail_p, ctx->n_fail_h);
-    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us/1000.0f);
-    fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->t_mel_us/1000.0f);
-    fprintf(stderr, "%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_sample_us, n_sample, 1e-3f*ctx->t_sample_us/n_sample);
-    fprintf(stderr, "%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_encode_us, n_encode, 1e-3f*ctx->t_encode_us/n_encode);
-    fprintf(stderr, "%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f*ctx->t_decode_us, n_decode, 1e-3f*ctx->t_decode_us/n_decode);
+    fprintf(stderr, "%s:     load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+    if (ctx->state != nullptr) {
+        const int32_t n_sample = std::max(1, ctx->state->n_sample);
+        const int32_t n_encode = std::max(1, ctx->state->n_encode);
+        const int32_t n_decode = std::max(1, ctx->state->n_decode);
+        fprintf(stderr, "%s:     fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
+        fprintf(stderr, "%s:      mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
+        fprintf(stderr, "%s:   sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
+        fprintf(stderr, "%s:   encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
+        fprintf(stderr, "%s:   decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
+    }
     fprintf(stderr, "%s:    total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
 }
 void whisper_reset_timings(struct whisper_context * ctx) {
-    ctx->t_sample_us = 0;
-    ctx->t_encode_us = 0;
-    ctx->t_decode_us = 0;
+    if (ctx->state != nullptr) {
+        ctx->state->t_sample_us = 0;
+        ctx->state->t_encode_us = 0;
+        ctx->state->t_decode_us = 0;
+    }
 }
 const char * whisper_print_system_info(void) {
@@ -2913,7 +3053,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.duration_ms      =*/ 0,
         /*.translate        =*/ false,
-        /*.no_context       =*/ false,
+        /*.no_context       =*/ true,
         /*.single_segment   =*/ false,
         /*.print_special    =*/ false,
         /*.print_progress   =*/ true,
@@ -2991,6 +3131,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
 static std::vector<float> get_signal_energy(const float * signal, int n_samples, int n_samples_per_half_window);
 static void whisper_exp_compute_token_level_timestamps(
         struct whisper_context & ctx,
+          struct whisper_state & state,
                            int   i_segment,
                          float   thold_pt,
                          float   thold_ptsum);
@@ -3023,8 +3164,8 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
 // wrap the last segment to max_len characters
 // returns the number of new segments
-static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool split_on_word) {
-    auto segment = ctx.result_all.back();
+static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_state & state, int max_len, bool split_on_word) {
+    auto segment = state.result_all.back();
     int res = 1;
     int acc = 0;
@@ -3046,24 +3187,24 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
                 trim(text);
             }
-            ctx.result_all.back().text = std::move(text);
-            ctx.result_all.back().t1 = token.t0;
-            ctx.result_all.back().tokens.resize(i);
+            state.result_all.back().text = std::move(text);
+            state.result_all.back().t1 = token.t0;
+            state.result_all.back().tokens.resize(i);
-            ctx.result_all.push_back({});
-            ctx.result_all.back().t0 = token.t0;
-            ctx.result_all.back().t1 = segment.t1;
+            state.result_all.push_back({});
+            state.result_all.back().t0 = token.t0;
+            state.result_all.back().t1 = segment.t1;
             // add tokens [i, end] to the new segment
-            ctx.result_all.back().tokens.insert(
-                    ctx.result_all.back().tokens.end(),
+            state.result_all.back().tokens.insert(
+                state.result_all.back().tokens.end(),
                     segment.tokens.begin() + i,
                     segment.tokens.end());
             acc = 0;
             text = "";
-            segment = ctx.result_all.back();
+            segment = state.result_all.back();
             i = -1;
             res++;
@@ -3076,7 +3217,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool
     if (split_on_word) {
         trim(text);
     }
-    ctx.result_all.back().text = std::move(text);
+    state.result_all.back().text = std::move(text);
     return res;
 }
@@ -3093,6 +3234,7 @@ static const std::vector<std::string> non_speech_tokens = {
 // - computes logprobs and probs
 static void whisper_process_logits(
               struct whisper_context & ctx,
+               struct whisper_state  & state,
     const struct whisper_full_params   params,
               struct whisper_decoder & decoder,
                                float   temperature) {
@@ -3111,7 +3253,7 @@ static void whisper_process_logits(
     auto & logprobs = decoder.logprobs;
     {
         logits.resize(n_logits);
-        memcpy(logits.data(), ctx.logits.data() + (ctx.logits.size() - n_logits), n_logits*sizeof(float));
+        memcpy(logits.data(), state.logits.data() + (state.logits.size() - n_logits), n_logits*sizeof(float));
         if (temperature > 0.0f) {
             for (int i = 0; i < n_logits; i++) {
@@ -3149,7 +3291,7 @@ static void whisper_process_logits(
         logits[vocab.token_transcribe] = -INFINITY;
         if (params.logits_filter_callback) {
-            params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
+            params.logits_filter_callback(&ctx, &state, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
         }
         // suppress non-speech tokens
@@ -3310,6 +3452,7 @@ static void whisper_process_logits(
 static whisper_token_data whisper_sample_token(
             whisper_context & ctx,
+              whisper_state & state,
       const whisper_decoder & decoder,
                        bool   best) {
     whisper_token_data result = {
@@ -3354,7 +3497,7 @@ static whisper_token_data whisper_sample_token(
     } else {
         std::discrete_distribution<> dist(probs.begin(), probs.end());
-        result.id   = dist(ctx.rng);
+        result.id   = dist(state.rng);
         result.p    = probs[result.id];
         result.plog = logprobs[result.id];
     }
@@ -3364,13 +3507,14 @@ static whisper_token_data whisper_sample_token(
         result.pt  = result.p;
     }
-    ctx.n_sample++;
+    state.n_sample++;
     return result;
 }
 static std::vector<whisper_token_data> whisper_sample_token_topk(
             whisper_context & ctx,
+              whisper_state & state,
       const whisper_decoder & decoder,
                         int   k) {
     const auto & vocab = ctx.vocab;
@@ -3381,7 +3525,7 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
     const int n_logits = vocab.n_vocab;
-    auto & logits_id = ctx.logits_id;
+    auto & logits_id = state.logits_id;
     logits_id.clear();
     for (int i = 0; i < n_logits; ++i) {
@@ -3434,7 +3578,7 @@ static std::vector<whisper_token_data> whisper_sample_token_topk(
         }
     }
-    ctx.n_sample++;
+    state.n_sample++;
     return result;
 }
@@ -3488,24 +3632,25 @@ static void whisper_sequence_score(
     }
 }
-int whisper_full(
+int whisper_full_with_state(
         struct whisper_context * ctx,
-        struct whisper_full_params params,
-        const float * samples,
-        int n_samples) {
+          struct whisper_state * state,
+    struct whisper_full_params   params,
+                   const float * samples,
+                           int   n_samples) {
     // clear old results
-    auto & result_all = ctx->result_all;
+    auto & result_all = state->result_all;
     result_all.clear();
     // compute log mel spectrogram
     if (params.speed_up) {
-        if (whisper_pcm_to_mel_phase_vocoder(ctx, samples, n_samples, params.n_threads) != 0) {
+        if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
             fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
             return -1;
         }
     } else {
-        if (whisper_pcm_to_mel(ctx, samples, n_samples, params.n_threads) != 0) {
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
             fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
             return -2;
         }
@@ -3515,26 +3660,26 @@ int whisper_full(
     if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0) {
         std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);
-        const auto lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, probs.data());
+        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
         if (lang_id < 0) {
             fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
             return -3;
         }
-        ctx->lang_id = lang_id;
+        state->lang_id = lang_id;
         params.language = whisper_lang_str(lang_id);
         fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
     }
     if (params.token_timestamps) {
-        ctx->t_beg    = 0;
-        ctx->t_last   = 0;
-        ctx->tid_last = 0;
-        ctx->energy = get_signal_energy(samples, n_samples, 32);
+        state->t_beg    = 0;
+        state->t_last   = 0;
+        state->tid_last = 0;
+        state->energy = get_signal_energy(samples, n_samples, 32);
     }
     const int seek_start = params.offset_ms/10;
-    const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10);
+    const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10);
     // if length of spectrogram is less than 1s (100 samples), then return
     // basically don't process anything that is less than 1s
@@ -3572,10 +3717,10 @@ int whisper_full(
     // TAGS: WHISPER_DECODER_INIT
     for (int j = 1; j < n_decoders; j++) {
-        auto & decoder = ctx->decoders[j];
+        auto & decoder = state->decoders[j];
         if (decoder.kv_self.ctx == nullptr) {
-            decoder.kv_self = ctx->decoders[0].kv_self;
+            decoder.kv_self = state->decoders[0].kv_self;
             if (!kv_cache_reinit(decoder.kv_self)) {
                 fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
                 return -4;
@@ -3583,7 +3728,7 @@ int whisper_full(
             WHISPER_PRINT_DEBUG("%s: initialized self-attention kv cache, decoder %d\n", __func__, j);
-            decoder.sequence.tokens.reserve(ctx->decoders[0].sequence.tokens.capacity());
+            decoder.sequence.tokens.reserve(state->decoders[0].sequence.tokens.capacity());
             decoder.probs.resize   (ctx->vocab.n_vocab);
             decoder.logits.resize  (ctx->vocab.n_vocab);
@@ -3592,7 +3737,7 @@ int whisper_full(
     }
     // the accumulated text context so far
-    auto & prompt_past = ctx->prompt_past;
+    auto & prompt_past = state->prompt_past;
     if (params.no_context) {
         prompt_past.clear();
     }
@@ -3611,13 +3756,13 @@ int whisper_full(
         fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
         return -5;
     }
-    ctx->exp_n_audio_ctx = params.audio_ctx;
+    state->exp_n_audio_ctx = params.audio_ctx;
     // these tokens determine the task that will be performed
     std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
     if (whisper_is_multilingual(ctx)) {
         const int lang_id = whisper_lang_id(params.language);
-        ctx->lang_id = lang_id;
+        state->lang_id = lang_id;
         prompt_init.push_back(whisper_token_lang(ctx, lang_id));
         if (params.translate) {
             prompt_init.push_back(whisper_token_translate());
@@ -3669,14 +3814,14 @@ int whisper_full(
         }
         if (params.encoder_begin_callback) {
-            if (params.encoder_begin_callback(ctx, params.encoder_begin_callback_user_data) == false) {
+            if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
                 fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
                 break;
             }
         }
         // encode audio features starting at offset seek
-        if (!whisper_encode(*ctx, seek, params.n_threads)) {
+        if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
             fprintf(stderr, "%s: failed to encode\n", __func__);
             return -6;
         }
@@ -3717,7 +3862,7 @@ int whisper_full(
             // TAGS: WHISPER_DECODER_INIT
             for (int j = 0; j < n_decoders_cur; ++j) {
-                auto & decoder = ctx->decoders[j];
+                auto & decoder = state->decoders[j];
                 decoder.kv_self.n = 0;
@@ -3759,7 +3904,7 @@ int whisper_full(
                 }
                 WHISPER_PRINT_DEBUG("\n\n");
-                if (!whisper_decode(*ctx, ctx->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
+                if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
                     fprintf(stderr, "%s: failed to decode\n", __func__);
                     return -7;
                 }
@@ -3767,24 +3912,24 @@ int whisper_full(
                 {
                     const int64_t t_start_sample_us = ggml_time_us();
-                    whisper_process_logits(*ctx, params, ctx->decoders[0], t_cur);
+                    whisper_process_logits(*ctx, *state, params, state->decoders[0], t_cur);
-                    ctx->decoders[0].kv_self.n += prompt.size();
+                    state->decoders[0].kv_self.n += prompt.size();
                     for (int j = 1; j < n_decoders_cur; ++j) {
-                        auto & decoder = ctx->decoders[j];
+                        auto & decoder = state->decoders[j];
-                        memcpy(decoder.kv_self.k->data, ctx->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
-                        memcpy(decoder.kv_self.v->data, ctx->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
+                        memcpy(decoder.kv_self.k->data, state->decoders[0].kv_self.k->data, ggml_nbytes(decoder.kv_self.k));
+                        memcpy(decoder.kv_self.v->data, state->decoders[0].kv_self.v->data, ggml_nbytes(decoder.kv_self.v));
                         decoder.kv_self.n += prompt.size();
-                        memcpy(decoder.probs.data(),    ctx->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
-                        memcpy(decoder.logits.data(),   ctx->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
-                        memcpy(decoder.logprobs.data(), ctx->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
+                        memcpy(decoder.probs.data(), state->decoders[0].probs.data(),    decoder.probs.size()*sizeof(decoder.probs[0]));
+                        memcpy(decoder.logits.data(), state->decoders[0].logits.data(),   decoder.logits.size()*sizeof(decoder.logits[0]));
+                        memcpy(decoder.logprobs.data(), state->decoders[0].logprobs.data(), decoder.logprobs.size()*sizeof(decoder.logprobs[0]));
                     }
-                    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                    state->t_sample_us += ggml_time_us() - t_start_sample_us;
                 }
             }
@@ -3795,7 +3940,7 @@ int whisper_full(
                 if (params.strategy == whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH) {
                     kv_bufs.resize(n_decoders_cur);
                     for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = ctx->decoders[j];
+                        auto & decoder = state->decoders[j];
                         if (decoder.completed || decoder.failed) {
                             continue;
@@ -3813,7 +3958,7 @@ int whisper_full(
                 // generate new sequence candidates for each decoder
                 for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = ctx->decoders[j];
+                    auto & decoder = state->decoders[j];
                     if (decoder.completed || decoder.failed) {
                         continue;
@@ -3823,16 +3968,16 @@ int whisper_full(
                         case whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY:
                             {
                                 if (t_cur < 1e-6f) {
-                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, decoder, true));
+                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, true));
                                 } else {
-                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, decoder, false));
+                                    decoder.sequence.tokens.push_back(whisper_sample_token(*ctx, *state, decoder, false));
                                 }
                                 decoder.sequence.sum_logprobs_all += decoder.sequence.tokens.back().plog;
                             } break;
                         case whisper_sampling_strategy::WHISPER_SAMPLING_BEAM_SEARCH:
                             {
-                                const auto tokens_new = whisper_sample_token_topk(*ctx, decoder, params.beam_search.beam_size);
+                                const auto tokens_new = whisper_sample_token_topk(*ctx, *state, decoder, params.beam_search.beam_size);
                                 for (const auto & token : tokens_new) {
                                     beam_candidates.push_back({ j, decoder.seek_delta, decoder.has_ts, decoder.sequence });
@@ -3857,7 +4002,7 @@ int whisper_full(
                     uint32_t cur_c = 0;
                     for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = ctx->decoders[j];
+                        auto & decoder = state->decoders[j];
                         if (decoder.completed || decoder.failed) {
                             continue;
@@ -3886,7 +4031,7 @@ int whisper_full(
                 // - check if the sequence is failed
                 // - update sliding window based on timestamp tokens
                 for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = ctx->decoders[j];
+                    auto & decoder = state->decoders[j];
                     if (decoder.completed || decoder.failed) {
                         continue;
@@ -3968,7 +4113,7 @@ int whisper_full(
                     bool completed_all = true;
                     for (int j = 0; j < n_decoders_cur; ++j) {
-                        auto & decoder = ctx->decoders[j];
+                        auto & decoder = state->decoders[j];
                         if (decoder.completed || decoder.failed) {
                             continue;
@@ -3982,11 +4127,11 @@ int whisper_full(
                     }
                 }
-                ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                state->t_sample_us += ggml_time_us() - t_start_sample_us;
                 // obtain logits for the next token
                 for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = ctx->decoders[j];
+                    auto & decoder = state->decoders[j];
                     if (decoder.failed || decoder.completed) {
                         continue;
@@ -3997,7 +4142,7 @@ int whisper_full(
                     //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
-                    if (!whisper_decode(*ctx, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
+                    if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
                         fprintf(stderr, "%s: failed to decode\n", __func__);
                         return -8;
                     }
@@ -4005,11 +4150,11 @@ int whisper_full(
                     {
                         const int64_t t_start_sample_us = ggml_time_us();
-                        whisper_process_logits(*ctx, params, decoder, t_cur);
+                        whisper_process_logits(*ctx, *state, params, decoder, t_cur);
                         ++decoder.kv_self.n;
-                        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+                        state->t_sample_us += ggml_time_us() - t_start_sample_us;
                     }
                 }
             }
@@ -4019,7 +4164,7 @@ int whisper_full(
                 double best_score = -INFINITY;
                 for (int j = 0; j < n_decoders_cur; ++j) {
-                    auto & decoder = ctx->decoders[j];
+                    auto & decoder = state->decoders[j];
                     if (decoder.failed) {
                         continue;
@@ -4036,7 +4181,7 @@ int whisper_full(
                                 __func__, j, decoder.sequence.entropy, params.entropy_thold);
                         decoder.failed = true;
-                        ctx->n_fail_h++;
+                        state->n_fail_h++;
                         continue;
                     }
@@ -4054,11 +4199,11 @@ int whisper_full(
             {
                 bool success = true;
-                const auto & decoder = ctx->decoders[best_decoder_id];
+                const auto & decoder = state->decoders[best_decoder_id];
                 if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
                     success = false;
-                    ctx->n_fail_p++;
+                    state->n_fail_p++;
                 }
                 if (success) {
@@ -4075,7 +4220,7 @@ int whisper_full(
         // output results through a user-provided callback
         {
-            const auto & best_decoder = ctx->decoders[best_decoder_id];
+            const auto & best_decoder = state->decoders[best_decoder_id];
             const auto seek_delta = best_decoder.seek_delta;
             const auto result_len = best_decoder.sequence.result_len;
@@ -4138,14 +4283,14 @@ int whisper_full(
                             if (params.token_timestamps) {
                                 whisper_exp_compute_token_level_timestamps(
-                                        *ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
+                                        *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
                                 if (params.max_len > 0) {
-                                    n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
+                                    n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
                                 }
                             }
                             if (params.new_segment_callback) {
-                                params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data);
+                                params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
                             }
                         }
                         text = "";
@@ -4182,14 +4327,14 @@ int whisper_full(
                     if (params.token_timestamps) {
                         whisper_exp_compute_token_level_timestamps(
-                                *ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
+                                *ctx, *state, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
                         if (params.max_len > 0) {
-                            n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
+                            n_new = whisper_wrap_segment(*ctx, *state, params.max_len, params.split_on_word);
                         }
                     }
                     if (params.new_segment_callback) {
-                        params.new_segment_callback(ctx, n_new, params.new_segment_callback_user_data);
+                        params.new_segment_callback(ctx, state, n_new, params.new_segment_callback_user_data);
                     }
                 }
             }
@@ -4204,6 +4349,15 @@ int whisper_full(
     return 0;
 }
+int whisper_full(
+        struct whisper_context * ctx,
+    struct whisper_full_params   params,
+                   const float * samples,
+                           int   n_samples) {
+    return whisper_full_with_state(ctx, ctx->state, params, samples, n_samples);
+}
 int whisper_full_parallel(
         struct whisper_context * ctx,
         struct whisper_full_params params,
@@ -4213,40 +4367,10 @@ int whisper_full_parallel(
     if (n_processors == 1) {
         return whisper_full(ctx, params, samples, n_samples);
     }
     int ret = 0;
-    // prepare separate contexts for each thread
-    std::vector<struct whisper_context> ctxs(n_processors - 1);
-    for (int i = 0; i < n_processors - 1; ++i) {
-        auto & ctx_p = ctxs[i];
-        ctx_p = *ctx;
-        ctx_p.logits.reserve(ctx_p.vocab.n_vocab*ctx_p.model.hparams.n_text_ctx);
-        ctx_p.logits_id.reserve(ctx_p.vocab.n_vocab);
-        if (!kv_cache_reinit(ctx_p.kv_cross)) {
-            fprintf(stderr, "%s: kv_cache_reinit() failed for cross-attention, processor %d\n", __func__, i);
-            return false;
-        }
-        // TAGS: WHISPER_DECODER_INIT
-        for (int j = 0; j < WHISPER_MAX_DECODERS; ++j) {
-            if (ctx_p.decoders[j].kv_self.ctx && !kv_cache_reinit(ctx_p.decoders[j].kv_self)) {
-                fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d, processor %d\n", __func__, j, i);
-                return false;
-            }
-            ctx_p.decoders[j].sequence.tokens.reserve(ctx_p.model.hparams.n_text_ctx);
-            ctx_p.decoders[j].probs.reserve   (ctx_p.vocab.n_vocab);
-            ctx_p.decoders[j].logits.reserve  (ctx_p.vocab.n_vocab);
-            ctx_p.decoders[j].logprobs.reserve(ctx_p.vocab.n_vocab);
-        }
-    }
+    // prepare separate states for each thread
+    std::vector<whisper_state*> states;
     const int offset_samples = (WHISPER_SAMPLE_RATE*params.offset_ms)/1000;
     const int n_samples_per_processor = (n_samples - offset_samples)/n_processors;
@@ -4256,6 +4380,9 @@ int whisper_full_parallel(
     std::vector<std::thread> workers(n_processors - 1);
     for (int i = 0; i < n_processors - 1; ++i) {
+        // create a new state for each thread
+        states.push_back(whisper_init_state(ctx));
         const int start_samples = offset_samples + (i + 1)*n_samples_per_processor;
         const int n_samples_cur = (i == n_processors - 2) ? n_samples - start_samples : n_samples_per_processor;
@@ -4268,13 +4395,17 @@ int whisper_full_parallel(
         params_cur.new_segment_callback = nullptr;
         params_cur.new_segment_callback_user_data = nullptr;
-        workers[i] = std::thread(whisper_full, &ctxs[i], std::move(params_cur), samples + start_samples, n_samples_cur);
+        workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
     }
     {
         auto params_cur = params;
-        ret = whisper_full(ctx, std::move(params_cur), samples, offset_samples + n_samples_per_processor);
+        // We need to disable the print real-time for this one as well, otherwise it will show only for the first chunk.
+        params_cur.print_realtime = false;
+        // Run the first transformation using default state but only for the first chunk.
+        ret = whisper_full_with_state(ctx, ctx->state, std::move(params_cur), samples, offset_samples + n_samples_per_processor);
     }
     for (int i = 0; i < n_processors - 1; ++i) {
@@ -4283,45 +4414,43 @@ int whisper_full_parallel(
     const int64_t offset_t = (int64_t) params.offset_ms/10.0;
-    // combine results into ctx->result_all
+    // combine results into result_state->result_all from all other states
     for (int i = 0; i < n_processors - 1; ++i) {
-        auto & results_i = ctxs[i].result_all;
+        auto& results_i = states[i]->result_all;
-        for (auto & result : results_i) {
+        for (auto& result : results_i) {
             // correct the segment timestamp taking into account the offset
-            result.t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
-            result.t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
+            result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
             // make sure that segments are not overlapping
-            if (!ctx->result_all.empty()) {
-                result.t0 = std::max(result.t0, ctx->result_all.back().t1);
+            if (!ctx->state->result_all.empty()) {
+                result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
             }
-            ctx->result_all.push_back(std::move(result));
+            ctx->state->result_all.push_back(std::move(result));
             // call the new_segment_callback for each segment
             if (params.new_segment_callback) {
-                params.new_segment_callback(ctx, 1, params.new_segment_callback_user_data);
+                params.new_segment_callback(ctx, ctx->state, 1, params.new_segment_callback_user_data);
             }
         }
-        ctx->t_mel_us    += ctxs[i].t_mel_us;
-        ctx->t_sample_us += ctxs[i].t_sample_us;
-        ctx->t_encode_us += ctxs[i].t_encode_us;
-        ctx->t_decode_us += ctxs[i].t_decode_us;
+        ctx->state->t_mel_us += states[i]->t_mel_us;
-        kv_cache_free(ctx->kv_cross);
+        ctx->state->t_sample_us += states[i]->t_sample_us;
+        ctx->state->t_encode_us += states[i]->t_encode_us;
+        ctx->state->t_decode_us += states[i]->t_decode_us;
-        for (int j = 0; j < WHISPER_MAX_DECODERS; ++j) {
-            kv_cache_free(ctx->decoders[j].kv_self);
-        }
+        whisper_free_state(states[i]);
     }
     // average the timings
-    ctx->t_mel_us    /= n_processors;
-    ctx->t_sample_us /= n_processors;
-    ctx->t_encode_us /= n_processors;
-    ctx->t_decode_us /= n_processors;
+    ctx->state->t_mel_us    /= n_processors;
+    ctx->state->t_sample_us /= n_processors;
+    ctx->state->t_encode_us /= n_processors;
+    ctx->state->t_decode_us /= n_processors;
     // print information about the audio boundaries
     fprintf(stderr, "\n");
@@ -4334,44 +4463,84 @@ int whisper_full_parallel(
     return ret;
 }
+int whisper_full_n_segments_from_state(struct whisper_state * state) {
+    return state->result_all.size();
+}
 int whisper_full_n_segments(struct whisper_context * ctx) {
-    return ctx->result_all.size();
+    return ctx->state->result_all.size();
+}
+int whisper_full_lang_id_from_state(struct whisper_state * state) {
+    return state->lang_id;
 }
 int whisper_full_lang_id(struct whisper_context * ctx) {
-    return ctx->lang_id;
+    return ctx->state->lang_id;
+}
+int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment) {
+    return state->result_all[i_segment].t0;
 }
 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
-    return ctx->result_all[i_segment].t0;
+    return ctx->state->result_all[i_segment].t0;
+}
+int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment) {
+    return state->result_all[i_segment].t1;
 }
 int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment) {
-    return ctx->result_all[i_segment].t1;
+    return ctx->state->result_all[i_segment].t1;
+}
+const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment) {
+    return state->result_all[i_segment].text.c_str();
 }
 const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment) {
-    return ctx->result_all[i_segment].text.c_str();
+    return ctx->state->result_all[i_segment].text.c_str();
+}
+int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment) {
+    return state->result_all[i_segment].tokens.size();
 }
 int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment) {
-    return ctx->result_all[i_segment].tokens.size();
+    return ctx->state->result_all[i_segment].tokens.size();
+}
+const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token) {
+    return ctx->vocab.id_to_token[state->result_all[i_segment].tokens[i_token].id].c_str();
+}
+const char* whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) {
+    return ctx->vocab.id_to_token[ctx->state->result_all[i_segment].tokens[i_token].id].c_str();
 }
-const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->vocab.id_to_token[ctx->result_all[i_segment].tokens[i_token].id].c_str();
+whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token) {
+    return state->result_all[i_segment].tokens[i_token].id;
 }
 whisper_token whisper_full_get_token_id(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->result_all[i_segment].tokens[i_token].id;
+    return ctx->state->result_all[i_segment].tokens[i_token].id;
+}
+struct whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token) {
+    return state->result_all[i_segment].tokens[i_token];
 }
 struct whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->result_all[i_segment].tokens[i_token];
+    return ctx->state->result_all[i_segment].tokens[i_token];
+}
+float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token) {
+    return state->result_all[i_segment].tokens[i_token].p;
 }
 float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) {
-    return ctx->result_all[i_segment].tokens[i_token].p;
+    return ctx->state->result_all[i_segment].tokens[i_token].p;
 }
 // =================================================================================================
@@ -4382,6 +4551,15 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
 //
 WHISPER_API int whisper_bench_memcpy(int n_threads) {
+    fputs(whisper_bench_memcpy_str(n_threads), stderr);
+    return 0;
+}
+WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
+    static std::string s;
+    s = "";
+    char strbuf[256];
     ggml_time_init();
     size_t n    = 50;
@@ -4411,7 +4589,8 @@ WHISPER_API int whisper_bench_memcpy(int n_threads) {
         src[0] = rand();
     }
-    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+    snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+    s += strbuf;
     // needed to prevent the compile from optimizing the memcpy away
     {
@@ -4419,16 +4598,26 @@ WHISPER_API int whisper_bench_memcpy(int n_threads) {
         for (size_t i = 0; i < size; i++) sum += dst[i];
-        fprintf(stderr, "sum:    %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
+        snprintf(strbuf, sizeof(strbuf), "sum:    %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
+        s += strbuf;
     }
     free(src);
     free(dst);
-    return 0;
+    return s.c_str();
 }
 WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
+    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
+    return 0;
+}
+WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+    static std::string s;
+    s = "";
+    char strbuf[256];
     ggml_time_init();
     const int n_max = 128;
@@ -4504,11 +4693,12 @@ WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
             s = ((2.0*N*N*N*n)/tsum)*1e-9;
         }
-        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
             N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+        s += strbuf;
     }
-    return 0;
+    return s.c_str();
 }
 // =================================================================================================
@@ -4583,13 +4773,14 @@ static std::vector<float> get_signal_energy(const float * signal, int n_samples,
 static void whisper_exp_compute_token_level_timestamps(
         struct whisper_context & ctx,
+          struct whisper_state & state,
                            int   i_segment,
                          float   thold_pt,
                          float   thold_ptsum) {
-    auto & segment = ctx.result_all[i_segment];
+    auto & segment = state.result_all[i_segment];
     auto & tokens  = segment.tokens;
-    const int n_samples = ctx.energy.size();
+    const int n_samples = state.energy.size();
     if (n_samples == 0) {
         fprintf(stderr, "%s: no signal data available\n", __func__);
@@ -4612,9 +4803,9 @@ static void whisper_exp_compute_token_level_timestamps(
         return;
     }
-    auto & t_beg    = ctx.t_beg;
-    auto & t_last   = ctx.t_last;
-    auto & tid_last = ctx.tid_last;
+    auto & t_beg    = state.t_beg;
+    auto & t_last   = state.t_last;
+    auto & tid_last = state.tid_last;
     for (int j = 0; j < n; ++j) {
         auto & token = tokens[j];
@@ -4737,15 +4928,15 @@ static void whisper_exp_compute_token_level_timestamps(
             float sum = 0.0f;
             for (int k = ss0; k < ss1; k++) {
-                sum += ctx.energy[k];
+                sum += state.energy[k];
             }
             const float thold = 0.5*sum/ns;
             {
                 int k = s0;
-                if (ctx.energy[k] > thold && j > 0) {
-                    while (k > 0 && ctx.energy[k] > thold) {
+                if (state.energy[k] > thold && j > 0) {
+                    while (k > 0 && state.energy[k] > thold) {
                         k--;
                     }
                     tokens[j].t0 = sample_to_timestamp(k);
@@ -4755,7 +4946,7 @@ static void whisper_exp_compute_token_level_timestamps(
                         s0 = k;
                     }
                 } else {
-                    while (ctx.energy[k] < thold && k < s1) {
+                    while (state.energy[k] < thold && k < s1) {
                         k++;
                     }
                     s0 = k;
@@ -4765,8 +4956,8 @@ static void whisper_exp_compute_token_level_timestamps(
             {
                 int k = s1;
-                if (ctx.energy[k] > thold) {
-                    while (k < n_samples - 1 && ctx.energy[k] > thold) {
+                if (state.energy[k] > thold) {
+                    while (k < n_samples - 1 && state.energy[k] > thold) {
                         k++;
                     }
                     tokens[j].t1 = sample_to_timestamp(k);
@@ -4776,7 +4967,7 @@ static void whisper_exp_compute_token_level_timestamps(
                         s1 = k;
                     }
                 } else {
-                    while (ctx.energy[k] < thold && k > s0) {
+                    while (state.energy[k] < thold && k > s0) {
                         k--;
                     }
                     s1 = k;