npm - @tekyzinc/stt-component - Versions diffs - 0.2.5 → 0.3.1 - Mend

@tekyzinc/stt-component 0.2.5 → 0.3.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.d.cts CHANGED Viewed

@@ -14,9 +14,9 @@ interface STTCorrectionConfig {
     enabled?: boolean;
     /** Correction engine provider. Default: 'whisper' */
     provider?: STTCorrectionProvider;
-    /** Silence duration (ms) before triggering correction. Default: 3000 */
+    /** Silence duration (ms) before triggering correction. Default: 1000 */
     pauseThreshold?: number;
-    /** Maximum interval (ms) between forced corrections. Default: 5000 */
+    /** Maximum interval (ms) between forced corrections. Default: 3000 */
     forcedInterval?: number;
 }
 /** Real-time streaming preview configuration. */
@@ -95,6 +95,10 @@ interface AudioCaptureHandle {
     samples: Float32Array[];
     /** Retain reference to prevent GC from stopping audio processing. */
     _processor: ScriptProcessorNode;
+    /** Source node for disconnect/reconnect on pause/resume. */
+    _source: MediaStreamAudioSourceNode;
+    /** Gain node (silent) to prevent mic playback. */
+    _silencer: GainNode;
 }
 /** Default configuration values. */
 declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
@@ -124,6 +128,18 @@ declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => voi
  * Uses ScriptProcessorNode to collect Float32Array samples directly.
  */
 declare function startCapture(): Promise<AudioCaptureHandle>;
+/**
+ * Pause capture without releasing mic or AudioContext.
+ * Disconnects the audio source so no new samples are collected.
+ * Returns resampled audio from the recording period.
+ * Call resumeCapture() to start collecting again.
+ */
+declare function pauseCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
+/**
+ * Resume a paused capture. Reconnects the audio source to the processor.
+ * AudioContext is resumed if suspended.
+ */
+declare function resumeCapture(capture: AudioCaptureHandle): Promise<void>;
 /**
  * Copy current audio buffer without stopping capture.
  * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
@@ -152,8 +168,13 @@ type WorkerManagerEvents = {
 declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
     private worker;
     private transcribeResolve;
+    private currentTranscribePromise;
     private modelReadyResolve;
     private modelReadyReject;
+    /** True while a transcription job is running in the worker. */
+    get isTranscribing(): boolean;
+    /** Await the current in-flight transcription without starting a new one. */
+    awaitCurrentTranscription(): Promise<string>;
     /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
     spawn(workerUrl?: URL): void;
     /** Load the Whisper model in the worker. Resolves when ready. */
@@ -167,12 +188,9 @@ declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
     private handleMessage;
 }
-/**
- * Manages mid-recording correction timing.
- * Two triggers: pause detection and forced interval.
- */
 declare class CorrectionOrchestrator {
     private forcedTimer;
+    private initialTimer;
     private lastCorrectionTime;
     private correctionFn;
     private config;
@@ -180,7 +198,9 @@ declare class CorrectionOrchestrator {
     constructor(config: ResolvedSTTConfig['correction']);
     /** Set the function to call when a correction is triggered. */
     setCorrectionFn(fn: () => void): void;
-    /** Start the correction orchestrator (begin forced interval timer). */
+    /** Start the correction orchestrator.
+     *  Fires a quick initial correction after 1s for early feedback, then
+     *  switches to the regular forcedInterval cadence from that point. */
     start(): void;
     /** Stop the orchestrator (clear all timers). */
     stop(): void;
@@ -220,8 +240,12 @@ declare class SpeechStreamingManager {
      * SpeechRecognition has claimed the microphone (onaudiostart) or after
      * a 300ms fallback — whichever comes first. The engine should await
      * this before calling getUserMedia to avoid dual-mic conflicts.
+     *
+     * When skipMicWait is true (warm restart — mic already active), returns
+     * immediately after calling recognition.start() without waiting for
+     * onaudiostart.
      */
-    start(language: string): Promise<void>;
+    start(language: string, skipMicWait?: boolean): Promise<void>;
     private clearNoResultTimer;
     /** Stop streaming recognition and return accumulated text. */
     stop(): string;
@@ -250,6 +274,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     private capture;
     private state;
     private workerUrl?;
+    /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
+    private _stopping;
     /**
      * Create a new STT engine instance.
      * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -260,7 +286,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     init(): Promise<void>;
     /** Start recording audio and enable correction cycles. */
     start(): Promise<void>;
-    /** Stop recording, run final transcription, return text. */
+    /** Stop recording, run final transcription, return text.
+     *  Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
     stop(): Promise<string>;
     /** Destroy the engine: terminate worker, release all resources. */
     destroy(): void;
@@ -276,4 +303,4 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     private emitDebug;
 }
-export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };
+export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, pauseCapture, resampleAudio, resolveConfig, resumeCapture, snapshotAudio, startCapture, stopCapture };

package/dist/index.d.ts CHANGED Viewed

@@ -14,9 +14,9 @@ interface STTCorrectionConfig {
     enabled?: boolean;
     /** Correction engine provider. Default: 'whisper' */
     provider?: STTCorrectionProvider;
-    /** Silence duration (ms) before triggering correction. Default: 3000 */
+    /** Silence duration (ms) before triggering correction. Default: 1000 */
     pauseThreshold?: number;
-    /** Maximum interval (ms) between forced corrections. Default: 5000 */
+    /** Maximum interval (ms) between forced corrections. Default: 3000 */
     forcedInterval?: number;
 }
 /** Real-time streaming preview configuration. */
@@ -95,6 +95,10 @@ interface AudioCaptureHandle {
     samples: Float32Array[];
     /** Retain reference to prevent GC from stopping audio processing. */
     _processor: ScriptProcessorNode;
+    /** Source node for disconnect/reconnect on pause/resume. */
+    _source: MediaStreamAudioSourceNode;
+    /** Gain node (silent) to prevent mic playback. */
+    _silencer: GainNode;
 }
 /** Default configuration values. */
 declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
@@ -124,6 +128,18 @@ declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => voi
  * Uses ScriptProcessorNode to collect Float32Array samples directly.
  */
 declare function startCapture(): Promise<AudioCaptureHandle>;
+/**
+ * Pause capture without releasing mic or AudioContext.
+ * Disconnects the audio source so no new samples are collected.
+ * Returns resampled audio from the recording period.
+ * Call resumeCapture() to start collecting again.
+ */
+declare function pauseCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
+/**
+ * Resume a paused capture. Reconnects the audio source to the processor.
+ * AudioContext is resumed if suspended.
+ */
+declare function resumeCapture(capture: AudioCaptureHandle): Promise<void>;
 /**
  * Copy current audio buffer without stopping capture.
  * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
@@ -152,8 +168,13 @@ type WorkerManagerEvents = {
 declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
     private worker;
     private transcribeResolve;
+    private currentTranscribePromise;
     private modelReadyResolve;
     private modelReadyReject;
+    /** True while a transcription job is running in the worker. */
+    get isTranscribing(): boolean;
+    /** Await the current in-flight transcription without starting a new one. */
+    awaitCurrentTranscription(): Promise<string>;
     /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
     spawn(workerUrl?: URL): void;
     /** Load the Whisper model in the worker. Resolves when ready. */
@@ -167,12 +188,9 @@ declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
     private handleMessage;
 }
-/**
- * Manages mid-recording correction timing.
- * Two triggers: pause detection and forced interval.
- */
 declare class CorrectionOrchestrator {
     private forcedTimer;
+    private initialTimer;
     private lastCorrectionTime;
     private correctionFn;
     private config;
@@ -180,7 +198,9 @@ declare class CorrectionOrchestrator {
     constructor(config: ResolvedSTTConfig['correction']);
     /** Set the function to call when a correction is triggered. */
     setCorrectionFn(fn: () => void): void;
-    /** Start the correction orchestrator (begin forced interval timer). */
+    /** Start the correction orchestrator.
+     *  Fires a quick initial correction after 1s for early feedback, then
+     *  switches to the regular forcedInterval cadence from that point. */
     start(): void;
     /** Stop the orchestrator (clear all timers). */
     stop(): void;
@@ -220,8 +240,12 @@ declare class SpeechStreamingManager {
      * SpeechRecognition has claimed the microphone (onaudiostart) or after
      * a 300ms fallback — whichever comes first. The engine should await
      * this before calling getUserMedia to avoid dual-mic conflicts.
+     *
+     * When skipMicWait is true (warm restart — mic already active), returns
+     * immediately after calling recognition.start() without waiting for
+     * onaudiostart.
      */
-    start(language: string): Promise<void>;
+    start(language: string, skipMicWait?: boolean): Promise<void>;
     private clearNoResultTimer;
     /** Stop streaming recognition and return accumulated text. */
     stop(): string;
@@ -250,6 +274,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     private capture;
     private state;
     private workerUrl?;
+    /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
+    private _stopping;
     /**
      * Create a new STT engine instance.
      * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -260,7 +286,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     init(): Promise<void>;
     /** Start recording audio and enable correction cycles. */
     start(): Promise<void>;
-    /** Stop recording, run final transcription, return text. */
+    /** Stop recording, run final transcription, return text.
+     *  Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
     stop(): Promise<string>;
     /** Destroy the engine: terminate worker, release all resources. */
     destroy(): void;
@@ -276,4 +303,4 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
     private emitDebug;
 }
-export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };
+export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, pauseCapture, resampleAudio, resolveConfig, resumeCapture, snapshotAudio, startCapture, stopCapture };

package/dist/index.js CHANGED Viewed

@@ -7,8 +7,8 @@ var DEFAULT_STT_CONFIG = {
   correction: {
     enabled: true,
     provider: "whisper",
-    pauseThreshold: 3e3,
-    forcedInterval: 5e3
+    pauseThreshold: 1e3,
+    forcedInterval: 3e3
   },
   chunking: {
     chunkLengthS: 30,
@@ -97,7 +97,19 @@ async function startCapture() {
   source.connect(processor);
   processor.connect(silencer);
   silencer.connect(audioCtx.destination);
-  return { audioCtx, stream, samples, _processor: processor };
+  return { audioCtx, stream, samples, _processor: processor, _source: source, _silencer: silencer };
+}
+async function pauseCapture(capture) {
+  capture._source.disconnect();
+  const currentSamples = [...capture.samples];
+  capture.samples.length = 0;
+  return resampleAudio(currentSamples, capture.audioCtx.sampleRate);
+}
+async function resumeCapture(capture) {
+  if (capture.audioCtx.state === "suspended") {
+    await capture.audioCtx.resume();
+  }
+  capture._source.connect(capture._processor);
 }
 function snapshotAudio(capture) {
   return [...capture.samples];
@@ -142,8 +154,17 @@ async function stopCapture(capture) {
 var WorkerManager = class extends TypedEventEmitter {
   worker = null;
   transcribeResolve = null;
+  currentTranscribePromise = null;
   modelReadyResolve = null;
   modelReadyReject = null;
+  /** True while a transcription job is running in the worker. */
+  get isTranscribing() {
+    return this.transcribeResolve !== null;
+  }
+  /** Await the current in-flight transcription without starting a new one. */
+  awaitCurrentTranscription() {
+    return this.currentTranscribePromise ?? Promise.resolve("");
+  }
   /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
   spawn(workerUrl) {
     if (this.worker) return;
@@ -179,10 +200,11 @@ var WorkerManager = class extends TypedEventEmitter {
   async transcribe(audio) {
     if (!this.worker) throw new Error("Worker not spawned");
     if (audio.length === 0) return "";
-    return new Promise((resolve) => {
+    this.currentTranscribePromise = new Promise((resolve) => {
       this.transcribeResolve = resolve;
       this.worker.postMessage({ type: "transcribe", audio }, [audio.buffer]);
     });
+    return this.currentTranscribePromise;
   }
   /** Cancel any in-flight transcription. */
   cancel() {
@@ -234,8 +256,10 @@ var WorkerManager = class extends TypedEventEmitter {
 };
 // src/correction-orchestrator.ts
+var INITIAL_CORRECTION_DELAY_MS = 1e3;
 var CorrectionOrchestrator = class {
   forcedTimer = null;
+  initialTimer = null;
   lastCorrectionTime = 0;
   correctionFn = null;
   config;
@@ -247,14 +271,25 @@ var CorrectionOrchestrator = class {
   setCorrectionFn(fn) {
     this.correctionFn = fn;
   }
-  /** Start the correction orchestrator (begin forced interval timer). */
+  /** Start the correction orchestrator.
+   *  Fires a quick initial correction after 1s for early feedback, then
+   *  switches to the regular forcedInterval cadence from that point. */
   start() {
     if (!this.config.enabled) return;
     this.lastCorrectionTime = Date.now();
-    this.startForcedTimer();
+    this.initialTimer = setTimeout(() => {
+      this.initialTimer = null;
+      this.correctionFn?.();
+      this.lastCorrectionTime = Date.now();
+      this.startForcedTimer();
+    }, INITIAL_CORRECTION_DELAY_MS);
   }
   /** Stop the orchestrator (clear all timers). */
   stop() {
+    if (this.initialTimer) {
+      clearTimeout(this.initialTimer);
+      this.initialTimer = null;
+    }
     this.stopForcedTimer();
   }
   /** Called when a speech pause is detected. Triggers correction if cooldown elapsed. */
@@ -402,8 +437,12 @@ var SpeechStreamingManager = class {
    * SpeechRecognition has claimed the microphone (onaudiostart) or after
    * a 300ms fallback — whichever comes first. The engine should await
    * this before calling getUserMedia to avoid dual-mic conflicts.
+   *
+   * When skipMicWait is true (warm restart — mic already active), returns
+   * immediately after calling recognition.start() without waiting for
+   * onaudiostart.
    */
-  start(language) {
+  start(language, skipMicWait = false) {
     const SR = getSpeechRecognition();
     if (!SR) {
       this.log("[SSM] SpeechRecognition not available in this environment");
@@ -513,6 +552,10 @@ var SpeechStreamingManager = class {
       );
       return Promise.resolve();
     }
+    if (skipMicWait) {
+      this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
+      return Promise.resolve();
+    }
     return micClaimPromise;
   }
   clearNoResultTimer() {
@@ -560,6 +603,8 @@ var STTEngine = class extends TypedEventEmitter {
   capture = null;
   state;
   workerUrl;
+  /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
+  _stopping = false;
   /**
    * Create a new STT engine instance.
    * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -605,14 +650,22 @@ var STTEngine = class extends TypedEventEmitter {
       throw new Error(`Cannot start: engine is "${this.state.status}", expected "ready"`);
     }
     try {
+      const warmCapture = this.capture && this.capture.stream.getTracks().every((t) => t.readyState === "live");
       this.emitDebug(
-        `[STT] start() \u2014 streaming: ${this.config.streaming.enabled}, lang: "${this.config.language}"`
+        `[STT] start() \u2014 streaming: ${this.config.streaming.enabled}, lang: "${this.config.language}", warm: ${!!warmCapture}`
       );
       if (this.config.streaming.enabled) {
-        await this.speechStreaming.start(this.config.language);
-        this.emitDebug("[STT] Speech API mic claim complete \u2014 starting getUserMedia");
+        await this.speechStreaming.start(this.config.language, !!warmCapture);
+        if (!warmCapture) {
+          this.emitDebug("[STT] Speech API mic claim complete \u2014 starting getUserMedia");
+        }
+      }
+      if (warmCapture) {
+        await resumeCapture(this.capture);
+        this.emitDebug("[STT] warm mic resumed \u2014 skipped getUserMedia");
+      } else {
+        this.capture = await startCapture();
       }
-      this.capture = await startCapture();
       this.updateStatus("recording");
       this.correctionOrchestrator.start();
     } catch (err) {
@@ -622,16 +675,49 @@ var STTEngine = class extends TypedEventEmitter {
       );
     }
   }
-  /** Stop recording, run final transcription, return text. */
+  /** Stop recording, run final transcription, return text.
+   *  Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
   async stop() {
     if (!this.capture) return "";
+    this._stopping = true;
     this.correctionOrchestrator.stop();
     this.speechStreaming.stop();
-    this.workerManager.cancel();
     this.updateStatus("processing");
+    if (this.workerManager.isTranscribing) {
+      try {
+        const [audio, inFlightText] = await Promise.all([
+          pauseCapture(this.capture),
+          this.workerManager.awaitCurrentTranscription()
+        ]);
+        this._stopping = false;
+        const text = inFlightText.trim();
+        if (text) {
+          this.emit("correction", text);
+          this.updateStatus("ready");
+          return text;
+        }
+        if (audio.length > 0) {
+          const freshText = await this.workerManager.transcribe(audio);
+          this.emit("correction", freshText);
+          this.updateStatus("ready");
+          return freshText;
+        }
+        this.updateStatus("ready");
+        return "";
+      } catch (err) {
+        this._stopping = false;
+        this.emitError(
+          "TRANSCRIPTION_FAILED",
+          err instanceof Error ? err.message : "Final transcription failed."
+        );
+        this.updateStatus("ready");
+        return "";
+      }
+    }
+    this.workerManager.cancel();
+    this._stopping = false;
     try {
-      const audio = await stopCapture(this.capture);
-      this.capture = null;
+      const audio = await pauseCapture(this.capture);
       if (audio.length === 0) {
         this.updateStatus("ready");
         return "";
@@ -654,6 +740,10 @@ var STTEngine = class extends TypedEventEmitter {
     this.correctionOrchestrator.stop();
     this.speechStreaming.destroy();
     if (this.capture) {
+      try {
+        this.capture._processor.disconnect();
+      } catch {
+      }
       for (const track of this.capture.stream.getTracks()) {
         track.stop();
       }
@@ -682,7 +772,7 @@ var STTEngine = class extends TypedEventEmitter {
       const audio = await resampleAudio(samples, nativeSr);
       if (audio.length === 0) return;
       const text = await this.workerManager.transcribe(audio);
-      if (text.trim() && this.capture) {
+      if (text.trim() && this.capture && !this._stopping) {
         this.emit("correction", text);
       }
     } catch (err) {
@@ -739,8 +829,10 @@ export {
   SpeechStreamingManager,
   TypedEventEmitter,
   WorkerManager,
+  pauseCapture,
   resampleAudio,
   resolveConfig,
+  resumeCapture,
   snapshotAudio,
   startCapture,
   stopCapture