@tekyzinc/stt-component 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -14,9 +14,9 @@ interface STTCorrectionConfig {
14
14
  enabled?: boolean;
15
15
  /** Correction engine provider. Default: 'whisper' */
16
16
  provider?: STTCorrectionProvider;
17
- /** Silence duration (ms) before triggering correction. Default: 3000 */
17
+ /** Silence duration (ms) before triggering correction. Default: 1000 */
18
18
  pauseThreshold?: number;
19
- /** Maximum interval (ms) between forced corrections. Default: 5000 */
19
+ /** Maximum interval (ms) between forced corrections. Default: 3000 */
20
20
  forcedInterval?: number;
21
21
  }
22
22
  /** Real-time streaming preview configuration. */
@@ -95,6 +95,10 @@ interface AudioCaptureHandle {
95
95
  samples: Float32Array[];
96
96
  /** Retain reference to prevent GC from stopping audio processing. */
97
97
  _processor: ScriptProcessorNode;
98
+ /** Source node for disconnect/reconnect on pause/resume. */
99
+ _source: MediaStreamAudioSourceNode;
100
+ /** Gain node (silent) to prevent mic playback. */
101
+ _silencer: GainNode;
98
102
  }
99
103
  /** Default configuration values. */
100
104
  declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
@@ -124,6 +128,18 @@ declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => voi
124
128
  * Uses ScriptProcessorNode to collect Float32Array samples directly.
125
129
  */
126
130
  declare function startCapture(): Promise<AudioCaptureHandle>;
131
+ /**
132
+ * Pause capture without releasing mic or AudioContext.
133
+ * Disconnects the audio source so no new samples are collected.
134
+ * Returns resampled audio from the recording period.
135
+ * Call resumeCapture() to start collecting again.
136
+ */
137
+ declare function pauseCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
138
+ /**
139
+ * Resume a paused capture. Reconnects the audio source to the processor.
140
+ * AudioContext is resumed if suspended.
141
+ */
142
+ declare function resumeCapture(capture: AudioCaptureHandle): Promise<void>;
127
143
  /**
128
144
  * Copy current audio buffer without stopping capture.
129
145
  * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
@@ -152,8 +168,13 @@ type WorkerManagerEvents = {
152
168
  declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
153
169
  private worker;
154
170
  private transcribeResolve;
171
+ private currentTranscribePromise;
155
172
  private modelReadyResolve;
156
173
  private modelReadyReject;
174
+ /** True while a transcription job is running in the worker. */
175
+ get isTranscribing(): boolean;
176
+ /** Await the current in-flight transcription without starting a new one. */
177
+ awaitCurrentTranscription(): Promise<string>;
157
178
  /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
158
179
  spawn(workerUrl?: URL): void;
159
180
  /** Load the Whisper model in the worker. Resolves when ready. */
@@ -220,8 +241,12 @@ declare class SpeechStreamingManager {
220
241
  * SpeechRecognition has claimed the microphone (onaudiostart) or after
221
242
  * a 300ms fallback — whichever comes first. The engine should await
222
243
  * this before calling getUserMedia to avoid dual-mic conflicts.
244
+ *
245
+ * When skipMicWait is true (warm restart — mic already active), returns
246
+ * immediately after calling recognition.start() without waiting for
247
+ * onaudiostart.
223
248
  */
224
- start(language: string): Promise<void>;
249
+ start(language: string, skipMicWait?: boolean): Promise<void>;
225
250
  private clearNoResultTimer;
226
251
  /** Stop streaming recognition and return accumulated text. */
227
252
  stop(): string;
@@ -250,6 +275,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
250
275
  private capture;
251
276
  private state;
252
277
  private workerUrl?;
278
+ /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
279
+ private _stopping;
253
280
  /**
254
281
  * Create a new STT engine instance.
255
282
  * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -260,7 +287,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
260
287
  init(): Promise<void>;
261
288
  /** Start recording audio and enable correction cycles. */
262
289
  start(): Promise<void>;
263
- /** Stop recording, run final transcription, return text. */
290
+ /** Stop recording, run final transcription, return text.
291
+ * Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
264
292
  stop(): Promise<string>;
265
293
  /** Destroy the engine: terminate worker, release all resources. */
266
294
  destroy(): void;
@@ -276,4 +304,4 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
276
304
  private emitDebug;
277
305
  }
278
306
 
279
- export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };
307
+ export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, pauseCapture, resampleAudio, resolveConfig, resumeCapture, snapshotAudio, startCapture, stopCapture };
package/dist/index.d.ts CHANGED
@@ -14,9 +14,9 @@ interface STTCorrectionConfig {
14
14
  enabled?: boolean;
15
15
  /** Correction engine provider. Default: 'whisper' */
16
16
  provider?: STTCorrectionProvider;
17
- /** Silence duration (ms) before triggering correction. Default: 3000 */
17
+ /** Silence duration (ms) before triggering correction. Default: 1000 */
18
18
  pauseThreshold?: number;
19
- /** Maximum interval (ms) between forced corrections. Default: 5000 */
19
+ /** Maximum interval (ms) between forced corrections. Default: 3000 */
20
20
  forcedInterval?: number;
21
21
  }
22
22
  /** Real-time streaming preview configuration. */
@@ -95,6 +95,10 @@ interface AudioCaptureHandle {
95
95
  samples: Float32Array[];
96
96
  /** Retain reference to prevent GC from stopping audio processing. */
97
97
  _processor: ScriptProcessorNode;
98
+ /** Source node for disconnect/reconnect on pause/resume. */
99
+ _source: MediaStreamAudioSourceNode;
100
+ /** Gain node (silent) to prevent mic playback. */
101
+ _silencer: GainNode;
98
102
  }
99
103
  /** Default configuration values. */
100
104
  declare const DEFAULT_STT_CONFIG: ResolvedSTTConfig;
@@ -124,6 +128,18 @@ declare class TypedEventEmitter<T extends Record<string, (...args: any[]) => voi
124
128
  * Uses ScriptProcessorNode to collect Float32Array samples directly.
125
129
  */
126
130
  declare function startCapture(): Promise<AudioCaptureHandle>;
131
+ /**
132
+ * Pause capture without releasing mic or AudioContext.
133
+ * Disconnects the audio source so no new samples are collected.
134
+ * Returns resampled audio from the recording period.
135
+ * Call resumeCapture() to start collecting again.
136
+ */
137
+ declare function pauseCapture(capture: AudioCaptureHandle): Promise<Float32Array>;
138
+ /**
139
+ * Resume a paused capture. Reconnects the audio source to the processor.
140
+ * AudioContext is resumed if suspended.
141
+ */
142
+ declare function resumeCapture(capture: AudioCaptureHandle): Promise<void>;
127
143
  /**
128
144
  * Copy current audio buffer without stopping capture.
129
145
  * Returns a shallow copy of the samples array (each chunk is shared, not cloned).
@@ -152,8 +168,13 @@ type WorkerManagerEvents = {
152
168
  declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
153
169
  private worker;
154
170
  private transcribeResolve;
171
+ private currentTranscribePromise;
155
172
  private modelReadyResolve;
156
173
  private modelReadyReject;
174
+ /** True while a transcription job is running in the worker. */
175
+ get isTranscribing(): boolean;
176
+ /** Await the current in-flight transcription without starting a new one. */
177
+ awaitCurrentTranscription(): Promise<string>;
157
178
  /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
158
179
  spawn(workerUrl?: URL): void;
159
180
  /** Load the Whisper model in the worker. Resolves when ready. */
@@ -220,8 +241,12 @@ declare class SpeechStreamingManager {
220
241
  * SpeechRecognition has claimed the microphone (onaudiostart) or after
221
242
  * a 300ms fallback — whichever comes first. The engine should await
222
243
  * this before calling getUserMedia to avoid dual-mic conflicts.
244
+ *
245
+ * When skipMicWait is true (warm restart — mic already active), returns
246
+ * immediately after calling recognition.start() without waiting for
247
+ * onaudiostart.
223
248
  */
224
- start(language: string): Promise<void>;
249
+ start(language: string, skipMicWait?: boolean): Promise<void>;
225
250
  private clearNoResultTimer;
226
251
  /** Stop streaming recognition and return accumulated text. */
227
252
  stop(): string;
@@ -250,6 +275,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
250
275
  private capture;
251
276
  private state;
252
277
  private workerUrl?;
278
+ /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
279
+ private _stopping;
253
280
  /**
254
281
  * Create a new STT engine instance.
255
282
  * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -260,7 +287,8 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
260
287
  init(): Promise<void>;
261
288
  /** Start recording audio and enable correction cycles. */
262
289
  start(): Promise<void>;
263
- /** Stop recording, run final transcription, return text. */
290
+ /** Stop recording, run final transcription, return text.
291
+ * Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
264
292
  stop(): Promise<string>;
265
293
  /** Destroy the engine: terminate worker, release all resources. */
266
294
  destroy(): void;
@@ -276,4 +304,4 @@ declare class STTEngine extends TypedEventEmitter<STTEvents> {
276
304
  private emitDebug;
277
305
  }
278
306
 
279
- export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, resampleAudio, resolveConfig, snapshotAudio, startCapture, stopCapture };
307
+ export { type AudioCaptureHandle, CorrectionOrchestrator, DEFAULT_STT_CONFIG, type ResolvedSTTConfig, type STTBackend, type STTChunkingConfig, type STTConfig, type STTCorrectionConfig, type STTCorrectionProvider, STTEngine, type STTError, type STTEvents, type STTModelSize, type STTState, type STTStatus, type STTStreamingConfig, type STTStreamingProvider, SpeechStreamingManager, TypedEventEmitter, WorkerManager, type WorkerManagerEvents, pauseCapture, resampleAudio, resolveConfig, resumeCapture, snapshotAudio, startCapture, stopCapture };
package/dist/index.js CHANGED
@@ -7,8 +7,8 @@ var DEFAULT_STT_CONFIG = {
7
7
  correction: {
8
8
  enabled: true,
9
9
  provider: "whisper",
10
- pauseThreshold: 3e3,
11
- forcedInterval: 5e3
10
+ pauseThreshold: 1e3,
11
+ forcedInterval: 3e3
12
12
  },
13
13
  chunking: {
14
14
  chunkLengthS: 30,
@@ -97,7 +97,19 @@ async function startCapture() {
97
97
  source.connect(processor);
98
98
  processor.connect(silencer);
99
99
  silencer.connect(audioCtx.destination);
100
- return { audioCtx, stream, samples, _processor: processor };
100
+ return { audioCtx, stream, samples, _processor: processor, _source: source, _silencer: silencer };
101
+ }
102
+ async function pauseCapture(capture) {
103
+ capture._source.disconnect();
104
+ const currentSamples = [...capture.samples];
105
+ capture.samples.length = 0;
106
+ return resampleAudio(currentSamples, capture.audioCtx.sampleRate);
107
+ }
108
+ async function resumeCapture(capture) {
109
+ if (capture.audioCtx.state === "suspended") {
110
+ await capture.audioCtx.resume();
111
+ }
112
+ capture._source.connect(capture._processor);
101
113
  }
102
114
  function snapshotAudio(capture) {
103
115
  return [...capture.samples];
@@ -142,8 +154,17 @@ async function stopCapture(capture) {
142
154
  var WorkerManager = class extends TypedEventEmitter {
143
155
  worker = null;
144
156
  transcribeResolve = null;
157
+ currentTranscribePromise = null;
145
158
  modelReadyResolve = null;
146
159
  modelReadyReject = null;
160
+ /** True while a transcription job is running in the worker. */
161
+ get isTranscribing() {
162
+ return this.transcribeResolve !== null;
163
+ }
164
+ /** Await the current in-flight transcription without starting a new one. */
165
+ awaitCurrentTranscription() {
166
+ return this.currentTranscribePromise ?? Promise.resolve("");
167
+ }
147
168
  /** Spawn the Web Worker. Must be called before loadModel/transcribe. */
148
169
  spawn(workerUrl) {
149
170
  if (this.worker) return;
@@ -179,10 +200,11 @@ var WorkerManager = class extends TypedEventEmitter {
179
200
  async transcribe(audio) {
180
201
  if (!this.worker) throw new Error("Worker not spawned");
181
202
  if (audio.length === 0) return "";
182
- return new Promise((resolve) => {
203
+ this.currentTranscribePromise = new Promise((resolve) => {
183
204
  this.transcribeResolve = resolve;
184
205
  this.worker.postMessage({ type: "transcribe", audio }, [audio.buffer]);
185
206
  });
207
+ return this.currentTranscribePromise;
186
208
  }
187
209
  /** Cancel any in-flight transcription. */
188
210
  cancel() {
@@ -402,8 +424,12 @@ var SpeechStreamingManager = class {
402
424
  * SpeechRecognition has claimed the microphone (onaudiostart) or after
403
425
  * a 300ms fallback — whichever comes first. The engine should await
404
426
  * this before calling getUserMedia to avoid dual-mic conflicts.
427
+ *
428
+ * When skipMicWait is true (warm restart — mic already active), returns
429
+ * immediately after calling recognition.start() without waiting for
430
+ * onaudiostart.
405
431
  */
406
- start(language) {
432
+ start(language, skipMicWait = false) {
407
433
  const SR = getSpeechRecognition();
408
434
  if (!SR) {
409
435
  this.log("[SSM] SpeechRecognition not available in this environment");
@@ -513,6 +539,10 @@ var SpeechStreamingManager = class {
513
539
  );
514
540
  return Promise.resolve();
515
541
  }
542
+ if (skipMicWait) {
543
+ this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
544
+ return Promise.resolve();
545
+ }
516
546
  return micClaimPromise;
517
547
  }
518
548
  clearNoResultTimer() {
@@ -560,6 +590,8 @@ var STTEngine = class extends TypedEventEmitter {
560
590
  capture = null;
561
591
  state;
562
592
  workerUrl;
593
+ /** Prevents performCorrection from emitting while stop() is consuming the in-flight result. */
594
+ _stopping = false;
563
595
  /**
564
596
  * Create a new STT engine instance.
565
597
  * @param config - Optional configuration overrides (model, backend, language, etc.).
@@ -605,14 +637,22 @@ var STTEngine = class extends TypedEventEmitter {
605
637
  throw new Error(`Cannot start: engine is "${this.state.status}", expected "ready"`);
606
638
  }
607
639
  try {
640
+ const warmCapture = this.capture && this.capture.stream.getTracks().every((t) => t.readyState === "live");
608
641
  this.emitDebug(
609
- `[STT] start() \u2014 streaming: ${this.config.streaming.enabled}, lang: "${this.config.language}"`
642
+ `[STT] start() \u2014 streaming: ${this.config.streaming.enabled}, lang: "${this.config.language}", warm: ${!!warmCapture}`
610
643
  );
611
644
  if (this.config.streaming.enabled) {
612
- await this.speechStreaming.start(this.config.language);
613
- this.emitDebug("[STT] Speech API mic claim complete \u2014 starting getUserMedia");
645
+ await this.speechStreaming.start(this.config.language, !!warmCapture);
646
+ if (!warmCapture) {
647
+ this.emitDebug("[STT] Speech API mic claim complete \u2014 starting getUserMedia");
648
+ }
649
+ }
650
+ if (warmCapture) {
651
+ await resumeCapture(this.capture);
652
+ this.emitDebug("[STT] warm mic resumed \u2014 skipped getUserMedia");
653
+ } else {
654
+ this.capture = await startCapture();
614
655
  }
615
- this.capture = await startCapture();
616
656
  this.updateStatus("recording");
617
657
  this.correctionOrchestrator.start();
618
658
  } catch (err) {
@@ -622,16 +662,49 @@ var STTEngine = class extends TypedEventEmitter {
622
662
  );
623
663
  }
624
664
  }
625
- /** Stop recording, run final transcription, return text. */
665
+ /** Stop recording, run final transcription, return text.
666
+ * Mic and AudioContext stay alive for fast restart — call destroy() to fully release. */
626
667
  async stop() {
627
668
  if (!this.capture) return "";
669
+ this._stopping = true;
628
670
  this.correctionOrchestrator.stop();
629
671
  this.speechStreaming.stop();
630
- this.workerManager.cancel();
631
672
  this.updateStatus("processing");
673
+ if (this.workerManager.isTranscribing) {
674
+ try {
675
+ const [audio, inFlightText] = await Promise.all([
676
+ pauseCapture(this.capture),
677
+ this.workerManager.awaitCurrentTranscription()
678
+ ]);
679
+ this._stopping = false;
680
+ const text = inFlightText.trim();
681
+ if (text) {
682
+ this.emit("correction", text);
683
+ this.updateStatus("ready");
684
+ return text;
685
+ }
686
+ if (audio.length > 0) {
687
+ const freshText = await this.workerManager.transcribe(audio);
688
+ this.emit("correction", freshText);
689
+ this.updateStatus("ready");
690
+ return freshText;
691
+ }
692
+ this.updateStatus("ready");
693
+ return "";
694
+ } catch (err) {
695
+ this._stopping = false;
696
+ this.emitError(
697
+ "TRANSCRIPTION_FAILED",
698
+ err instanceof Error ? err.message : "Final transcription failed."
699
+ );
700
+ this.updateStatus("ready");
701
+ return "";
702
+ }
703
+ }
704
+ this.workerManager.cancel();
705
+ this._stopping = false;
632
706
  try {
633
- const audio = await stopCapture(this.capture);
634
- this.capture = null;
707
+ const audio = await pauseCapture(this.capture);
635
708
  if (audio.length === 0) {
636
709
  this.updateStatus("ready");
637
710
  return "";
@@ -654,6 +727,10 @@ var STTEngine = class extends TypedEventEmitter {
654
727
  this.correctionOrchestrator.stop();
655
728
  this.speechStreaming.destroy();
656
729
  if (this.capture) {
730
+ try {
731
+ this.capture._processor.disconnect();
732
+ } catch {
733
+ }
657
734
  for (const track of this.capture.stream.getTracks()) {
658
735
  track.stop();
659
736
  }
@@ -682,7 +759,7 @@ var STTEngine = class extends TypedEventEmitter {
682
759
  const audio = await resampleAudio(samples, nativeSr);
683
760
  if (audio.length === 0) return;
684
761
  const text = await this.workerManager.transcribe(audio);
685
- if (text.trim() && this.capture) {
762
+ if (text.trim() && this.capture && !this._stopping) {
686
763
  this.emit("correction", text);
687
764
  }
688
765
  } catch (err) {
@@ -739,8 +816,10 @@ export {
739
816
  SpeechStreamingManager,
740
817
  TypedEventEmitter,
741
818
  WorkerManager,
819
+ pauseCapture,
742
820
  resampleAudio,
743
821
  resolveConfig,
822
+ resumeCapture,
744
823
  snapshotAudio,
745
824
  startCapture,
746
825
  stopCapture