@tekyzinc/stt-component 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
218
218
  private recognition;
219
219
  private accumulated;
220
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
221
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
222
226
  private noResultTimer;
227
+ private idleTimer;
223
228
  private onTranscript;
224
229
  private onPause;
225
230
  private onError;
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
236
241
  setOnDebug(fn: (message: string) => void): void;
237
242
  private log;
238
243
  /**
239
- * Start streaming recognition. Returns a Promise that resolves once
240
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
241
- * a 300ms fallback whichever comes first. The engine should await
242
- * this before calling getUserMedia to avoid dual-mic conflicts.
243
- *
244
- * When skipMicWait is true (warm restart — mic already active), returns
245
- * immediately after calling recognition.start() without waiting for
246
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
247
253
  */
248
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
249
260
  private clearNoResultTimer;
250
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
251
265
  stop(): string;
252
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
253
267
  destroy(): void;
254
268
  }
255
269
 
package/dist/index.d.ts CHANGED
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
218
218
  private recognition;
219
219
  private accumulated;
220
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
221
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
222
226
  private noResultTimer;
227
+ private idleTimer;
223
228
  private onTranscript;
224
229
  private onPause;
225
230
  private onError;
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
236
241
  setOnDebug(fn: (message: string) => void): void;
237
242
  private log;
238
243
  /**
239
- * Start streaming recognition. Returns a Promise that resolves once
240
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
241
- * a 300ms fallback whichever comes first. The engine should await
242
- * this before calling getUserMedia to avoid dual-mic conflicts.
243
- *
244
- * When skipMicWait is true (warm restart — mic already active), returns
245
- * immediately after calling recognition.start() without waiting for
246
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
247
253
  */
248
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
249
260
  private clearNoResultTimer;
250
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
251
265
  stop(): string;
252
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
253
267
  destroy(): void;
254
268
  }
255
269
 
package/dist/index.js CHANGED
@@ -114,6 +114,13 @@ async function resumeCapture(capture) {
114
114
  function snapshotAudio(capture) {
115
115
  return [...capture.samples];
116
116
  }
117
+ function trimAudioBuffer(capture, keepSeconds) {
118
+ const samplesPerChunk = 4096;
119
+ const chunksToKeep = Math.ceil(keepSeconds * capture.audioCtx.sampleRate / samplesPerChunk);
120
+ if (capture.samples.length > chunksToKeep) {
121
+ capture.samples.splice(0, capture.samples.length - chunksToKeep);
122
+ }
123
+ }
117
124
  async function resampleAudio(samples, nativeSr) {
118
125
  const totalLength = samples.reduce((sum, s) => sum + s.length, 0);
119
126
  if (totalLength === 0) return new Float32Array(0);
@@ -398,12 +405,22 @@ function toBCP47(language) {
398
405
  return WHISPER_TO_BCP47[language.toLowerCase()] ?? language;
399
406
  }
400
407
  var NO_RESULT_TIMEOUT_MS = 5e3;
408
+ var IDLE_TIMEOUT_MS = 3e4;
401
409
  var SpeechStreamingManager = class {
402
410
  recognition = null;
403
411
  accumulated = "";
404
412
  active = false;
413
+ // user is recording — emit results, trigger onPause on end
414
+ keepingWarm = false;
415
+ // recognition running silently between user sessions
416
+ currentLang = "";
417
+ // BCP-47 of running recognition (for fast-path check)
405
418
  receivedResult = false;
419
+ lastFinalIndex = -1;
420
+ // instance fields so warm-restart resets them cleanly
421
+ lastFinalText = "";
406
422
  noResultTimer = null;
423
+ idleTimer = null;
407
424
  onTranscript = null;
408
425
  onPause = null;
409
426
  onError = null;
@@ -433,14 +450,25 @@ var SpeechStreamingManager = class {
433
450
  console.warn(message);
434
451
  }
435
452
  /**
436
- * Start streaming recognition. Returns a Promise that resolves once
437
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
438
- * a 300ms fallback whichever comes first. The engine should await
439
- * this before calling getUserMedia to avoid dual-mic conflicts.
440
- *
441
- * When skipMicWait is true (warm restart — mic already active), returns
442
- * immediately after calling recognition.start() without waiting for
443
- * onaudiostart.
453
+ * Pre-warm: start recognition in muted mode so it's ready before the user
454
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
455
+ * first click by keeping the Google Speech session alive.
456
+ */
457
+ preWarm(language) {
458
+ const SR = getSpeechRecognition();
459
+ if (!SR) return;
460
+ const bcp47 = toBCP47(language);
461
+ if (this.recognition && this.currentLang === bcp47) return;
462
+ this.log(`[SSM] preWarm() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
463
+ this.keepingWarm = true;
464
+ this.active = false;
465
+ this.clearIdleTimer();
466
+ this.spawnRecognition(language);
467
+ }
468
+ /**
469
+ * Start streaming recognition. If recognition is already warm (session
470
+ * running from preWarm or a previous session within the idle window),
471
+ * activates instantly — no Google handshake. Otherwise cold-starts.
444
472
  */
445
473
  start(language, skipMicWait = false) {
446
474
  const SR = getSpeechRecognition();
@@ -451,14 +479,49 @@ var SpeechStreamingManager = class {
451
479
  const bcp47 = toBCP47(language);
452
480
  this.log(`[SSM] start() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
453
481
  this.accumulated = "";
454
- this.active = true;
455
482
  this.receivedResult = false;
483
+ this.lastFinalIndex = -1;
484
+ this.lastFinalText = "";
485
+ this.clearIdleTimer();
486
+ if (this.recognition && this.currentLang === bcp47) {
487
+ this.log("[SSM] start() \u2014 warm session, activating immediately");
488
+ this.keepingWarm = false;
489
+ this.active = true;
490
+ this.clearNoResultTimer();
491
+ this.noResultTimer = setTimeout(() => {
492
+ if (this.active && !this.receivedResult) {
493
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
494
+ this.onError?.(
495
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
496
+ );
497
+ }
498
+ }, NO_RESULT_TIMEOUT_MS);
499
+ return Promise.resolve();
500
+ }
501
+ this.keepingWarm = false;
502
+ this.active = true;
503
+ return this.spawnRecognition(language, skipMicWait);
504
+ }
505
+ /**
506
+ * Create and start a new SpeechRecognition instance.
507
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
508
+ */
509
+ spawnRecognition(language, skipMicWait = false) {
510
+ const SR = getSpeechRecognition();
511
+ const bcp47 = toBCP47(language);
512
+ if (this.recognition) {
513
+ const old = this.recognition;
514
+ this.recognition = null;
515
+ try {
516
+ old.stop();
517
+ } catch {
518
+ }
519
+ }
520
+ this.currentLang = bcp47;
456
521
  const recognition = new SR();
457
522
  recognition.continuous = true;
458
523
  recognition.interimResults = true;
459
524
  recognition.lang = bcp47;
460
- let lastFinalIndex = -1;
461
- let lastFinalText = "";
462
525
  let micReady = false;
463
526
  const micClaimPromise = new Promise((resolve) => {
464
527
  recognition.onaudiostart = () => {
@@ -478,26 +541,29 @@ var SpeechStreamingManager = class {
478
541
  }, 300);
479
542
  });
480
543
  this.clearNoResultTimer();
481
- this.noResultTimer = setTimeout(() => {
482
- if (this.active && !this.receivedResult) {
483
- this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
484
- this.onError?.(
485
- "Speech streaming started but received no results. Mic may be blocked by another audio capture."
486
- );
487
- }
488
- }, NO_RESULT_TIMEOUT_MS);
544
+ if (this.active) {
545
+ this.noResultTimer = setTimeout(() => {
546
+ if (this.active && !this.receivedResult) {
547
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
548
+ this.onError?.(
549
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
550
+ );
551
+ }
552
+ }, NO_RESULT_TIMEOUT_MS);
553
+ }
489
554
  recognition.onresult = (e) => {
490
555
  if (this.recognition !== recognition) return;
491
556
  this.receivedResult = true;
492
557
  this.clearNoResultTimer();
558
+ if (!this.active) return;
493
559
  let final_ = "";
494
560
  let interim = "";
495
561
  for (let i = e.resultIndex; i < e.results.length; i++) {
496
562
  const t = e.results[i][0].transcript;
497
563
  if (e.results[i].isFinal) {
498
- if (i > lastFinalIndex) {
564
+ if (i > this.lastFinalIndex) {
499
565
  final_ += t;
500
- lastFinalIndex = i;
566
+ this.lastFinalIndex = i;
501
567
  }
502
568
  } else {
503
569
  interim += t;
@@ -506,8 +572,8 @@ var SpeechStreamingManager = class {
506
572
  this.log(
507
573
  `[SSM] onresult \u2014 finals: "${final_}", interim: "${interim}", accumulated: "${this.accumulated}"`
508
574
  );
509
- if (final_ && final_.trim() !== lastFinalText) {
510
- lastFinalText = final_.trim();
575
+ if (final_ && final_.trim() !== this.lastFinalText) {
576
+ this.lastFinalText = final_.trim();
511
577
  this.accumulated = this.accumulated ? this.accumulated + " " + final_.trim() : final_.trim();
512
578
  this.onTranscript?.(this.accumulated);
513
579
  } else if (interim) {
@@ -519,11 +585,13 @@ var SpeechStreamingManager = class {
519
585
  recognition.onerror = (e) => {
520
586
  if (this.recognition !== recognition) return;
521
587
  this.log(`[SSM] onerror \u2014 ${e.error}`);
522
- this.onError?.(e.error);
588
+ if (this.active) this.onError?.(e.error);
523
589
  };
524
590
  recognition.onend = () => {
525
591
  if (this.recognition !== recognition) return;
526
- this.log(`[SSM] onend \u2014 active: ${this.active}, receivedResult: ${this.receivedResult}`);
592
+ this.log(
593
+ `[SSM] onend \u2014 active: ${this.active}, keepingWarm: ${this.keepingWarm}, receivedResult: ${this.receivedResult}`
594
+ );
527
595
  if (this.active) {
528
596
  this.onPause?.();
529
597
  try {
@@ -532,10 +600,22 @@ var SpeechStreamingManager = class {
532
600
  } catch (err) {
533
601
  this.log(`[SSM] restart THREW: ${err}`);
534
602
  this.recognition = null;
603
+ this.currentLang = "";
535
604
  this.onError?.("Speech recognition failed to restart after pause.");
536
605
  }
606
+ } else if (this.keepingWarm) {
607
+ try {
608
+ recognition.start();
609
+ this.log("[SSM] warm restart");
610
+ } catch (err) {
611
+ this.log(`[SSM] warm restart THREW: ${err}`);
612
+ this.recognition = null;
613
+ this.keepingWarm = false;
614
+ this.currentLang = "";
615
+ }
537
616
  } else {
538
617
  this.recognition = null;
618
+ this.currentLang = "";
539
619
  }
540
620
  };
541
621
  this.recognition = recognition;
@@ -545,15 +625,20 @@ var SpeechStreamingManager = class {
545
625
  } catch (err) {
546
626
  this.log(`[SSM] recognition.start() THREW: ${err}`);
547
627
  this.recognition = null;
628
+ this.currentLang = "";
629
+ const wasActive = this.active;
548
630
  this.active = false;
631
+ this.keepingWarm = false;
549
632
  this.clearNoResultTimer();
550
- this.onError?.(
551
- `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
552
- );
633
+ if (wasActive) {
634
+ this.onError?.(
635
+ `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
636
+ );
637
+ }
553
638
  return Promise.resolve();
554
639
  }
555
- if (skipMicWait) {
556
- this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
640
+ if (!this.active || skipMicWait) {
641
+ if (skipMicWait) this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
557
642
  return Promise.resolve();
558
643
  }
559
644
  return micClaimPromise;
@@ -564,26 +649,50 @@ var SpeechStreamingManager = class {
564
649
  this.noResultTimer = null;
565
650
  }
566
651
  }
567
- /** Stop streaming recognition and return accumulated text. */
652
+ clearIdleTimer() {
653
+ if (this.idleTimer) {
654
+ clearTimeout(this.idleTimer);
655
+ this.idleTimer = null;
656
+ }
657
+ }
658
+ startIdleTimer() {
659
+ this.clearIdleTimer();
660
+ this.idleTimer = setTimeout(() => {
661
+ this.idleTimer = null;
662
+ this.keepingWarm = false;
663
+ this.log("[SSM] idle timeout \u2014 stopping recognition");
664
+ if (this.recognition) {
665
+ const rec = this.recognition;
666
+ this.recognition = null;
667
+ this.currentLang = "";
668
+ try {
669
+ rec.stop();
670
+ } catch {
671
+ }
672
+ }
673
+ }, IDLE_TIMEOUT_MS);
674
+ }
675
+ /** Stop streaming recognition and return accumulated text.
676
+ * Keeps the recognition session alive (muted) for instant restart. */
568
677
  stop() {
569
678
  this.active = false;
679
+ this.keepingWarm = true;
570
680
  this.clearNoResultTimer();
571
- if (this.recognition) {
572
- const rec = this.recognition;
573
- this.recognition = null;
574
- rec.stop();
575
- }
576
681
  const result = this.accumulated;
577
682
  this.accumulated = "";
683
+ this.startIdleTimer();
578
684
  return result;
579
685
  }
580
- /** Abort immediately without returning text. */
686
+ /** Abort immediately and release all resources. */
581
687
  destroy() {
582
688
  this.active = false;
689
+ this.keepingWarm = false;
583
690
  this.clearNoResultTimer();
691
+ this.clearIdleTimer();
584
692
  if (this.recognition) {
585
693
  const rec = this.recognition;
586
694
  this.recognition = null;
695
+ this.currentLang = "";
587
696
  rec.abort();
588
697
  }
589
698
  this.accumulated = "";
@@ -638,6 +747,9 @@ var STTEngine = class extends TypedEventEmitter {
638
747
  await this.workerManager.loadModel(this.config);
639
748
  this.state.isModelLoaded = true;
640
749
  this.updateStatus("ready");
750
+ if (this.config.streaming.enabled) {
751
+ this.speechStreaming.preWarm(this.config.language);
752
+ }
641
753
  } catch (err) {
642
754
  this.emitError("MODEL_LOAD_FAILED", err instanceof Error ? err.message : String(err));
643
755
  this.updateStatus("idle");
@@ -774,6 +886,7 @@ var STTEngine = class extends TypedEventEmitter {
774
886
  const text = await this.workerManager.transcribe(audio);
775
887
  if (text.trim() && this.capture && !this._stopping) {
776
888
  this.emit("correction", text);
889
+ trimAudioBuffer(this.capture, 30);
777
890
  }
778
891
  } catch (err) {
779
892
  this.emitError(