@tekyzinc/stt-component 0.3.1 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
218
218
  private recognition;
219
219
  private accumulated;
220
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
221
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
222
226
  private noResultTimer;
227
+ private idleTimer;
223
228
  private onTranscript;
224
229
  private onPause;
225
230
  private onError;
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
236
241
  setOnDebug(fn: (message: string) => void): void;
237
242
  private log;
238
243
  /**
239
- * Start streaming recognition. Returns a Promise that resolves once
240
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
241
- * a 300ms fallback whichever comes first. The engine should await
242
- * this before calling getUserMedia to avoid dual-mic conflicts.
243
- *
244
- * When skipMicWait is true (warm restart — mic already active), returns
245
- * immediately after calling recognition.start() without waiting for
246
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
247
253
  */
248
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
249
260
  private clearNoResultTimer;
250
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
251
265
  stop(): string;
252
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
253
267
  destroy(): void;
254
268
  }
255
269
 
package/dist/index.d.ts CHANGED
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
218
218
  private recognition;
219
219
  private accumulated;
220
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
221
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
222
226
  private noResultTimer;
227
+ private idleTimer;
223
228
  private onTranscript;
224
229
  private onPause;
225
230
  private onError;
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
236
241
  setOnDebug(fn: (message: string) => void): void;
237
242
  private log;
238
243
  /**
239
- * Start streaming recognition. Returns a Promise that resolves once
240
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
241
- * a 300ms fallback whichever comes first. The engine should await
242
- * this before calling getUserMedia to avoid dual-mic conflicts.
243
- *
244
- * When skipMicWait is true (warm restart — mic already active), returns
245
- * immediately after calling recognition.start() without waiting for
246
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
247
253
  */
248
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
249
260
  private clearNoResultTimer;
250
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
251
265
  stop(): string;
252
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
253
267
  destroy(): void;
254
268
  }
255
269
 
package/dist/index.js CHANGED
@@ -398,12 +398,22 @@ function toBCP47(language) {
398
398
  return WHISPER_TO_BCP47[language.toLowerCase()] ?? language;
399
399
  }
400
400
  var NO_RESULT_TIMEOUT_MS = 5e3;
401
+ var IDLE_TIMEOUT_MS = 3e4;
401
402
  var SpeechStreamingManager = class {
402
403
  recognition = null;
403
404
  accumulated = "";
404
405
  active = false;
406
+ // user is recording — emit results, trigger onPause on end
407
+ keepingWarm = false;
408
+ // recognition running silently between user sessions
409
+ currentLang = "";
410
+ // BCP-47 of running recognition (for fast-path check)
405
411
  receivedResult = false;
412
+ lastFinalIndex = -1;
413
+ // instance fields so warm-restart resets them cleanly
414
+ lastFinalText = "";
406
415
  noResultTimer = null;
416
+ idleTimer = null;
407
417
  onTranscript = null;
408
418
  onPause = null;
409
419
  onError = null;
@@ -433,14 +443,25 @@ var SpeechStreamingManager = class {
433
443
  console.warn(message);
434
444
  }
435
445
  /**
436
- * Start streaming recognition. Returns a Promise that resolves once
437
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
438
- * a 300ms fallback whichever comes first. The engine should await
439
- * this before calling getUserMedia to avoid dual-mic conflicts.
440
- *
441
- * When skipMicWait is true (warm restart — mic already active), returns
442
- * immediately after calling recognition.start() without waiting for
443
- * onaudiostart.
446
+ * Pre-warm: start recognition in muted mode so it's ready before the user
447
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
448
+ * first click by keeping the Google Speech session alive.
449
+ */
450
+ preWarm(language) {
451
+ const SR = getSpeechRecognition();
452
+ if (!SR) return;
453
+ const bcp47 = toBCP47(language);
454
+ if (this.recognition && this.currentLang === bcp47) return;
455
+ this.log(`[SSM] preWarm() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
456
+ this.keepingWarm = true;
457
+ this.active = false;
458
+ this.clearIdleTimer();
459
+ this.spawnRecognition(language);
460
+ }
461
+ /**
462
+ * Start streaming recognition. If recognition is already warm (session
463
+ * running from preWarm or a previous session within the idle window),
464
+ * activates instantly — no Google handshake. Otherwise cold-starts.
444
465
  */
445
466
  start(language, skipMicWait = false) {
446
467
  const SR = getSpeechRecognition();
@@ -451,14 +472,49 @@ var SpeechStreamingManager = class {
451
472
  const bcp47 = toBCP47(language);
452
473
  this.log(`[SSM] start() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
453
474
  this.accumulated = "";
454
- this.active = true;
455
475
  this.receivedResult = false;
476
+ this.lastFinalIndex = -1;
477
+ this.lastFinalText = "";
478
+ this.clearIdleTimer();
479
+ if (this.recognition && this.currentLang === bcp47) {
480
+ this.log("[SSM] start() \u2014 warm session, activating immediately");
481
+ this.keepingWarm = false;
482
+ this.active = true;
483
+ this.clearNoResultTimer();
484
+ this.noResultTimer = setTimeout(() => {
485
+ if (this.active && !this.receivedResult) {
486
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
487
+ this.onError?.(
488
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
489
+ );
490
+ }
491
+ }, NO_RESULT_TIMEOUT_MS);
492
+ return Promise.resolve();
493
+ }
494
+ this.keepingWarm = false;
495
+ this.active = true;
496
+ return this.spawnRecognition(language, skipMicWait);
497
+ }
498
+ /**
499
+ * Create and start a new SpeechRecognition instance.
500
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
501
+ */
502
+ spawnRecognition(language, skipMicWait = false) {
503
+ const SR = getSpeechRecognition();
504
+ const bcp47 = toBCP47(language);
505
+ if (this.recognition) {
506
+ const old = this.recognition;
507
+ this.recognition = null;
508
+ try {
509
+ old.stop();
510
+ } catch {
511
+ }
512
+ }
513
+ this.currentLang = bcp47;
456
514
  const recognition = new SR();
457
515
  recognition.continuous = true;
458
516
  recognition.interimResults = true;
459
517
  recognition.lang = bcp47;
460
- let lastFinalIndex = -1;
461
- let lastFinalText = "";
462
518
  let micReady = false;
463
519
  const micClaimPromise = new Promise((resolve) => {
464
520
  recognition.onaudiostart = () => {
@@ -478,26 +534,29 @@ var SpeechStreamingManager = class {
478
534
  }, 300);
479
535
  });
480
536
  this.clearNoResultTimer();
481
- this.noResultTimer = setTimeout(() => {
482
- if (this.active && !this.receivedResult) {
483
- this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
484
- this.onError?.(
485
- "Speech streaming started but received no results. Mic may be blocked by another audio capture."
486
- );
487
- }
488
- }, NO_RESULT_TIMEOUT_MS);
537
+ if (this.active) {
538
+ this.noResultTimer = setTimeout(() => {
539
+ if (this.active && !this.receivedResult) {
540
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
541
+ this.onError?.(
542
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
543
+ );
544
+ }
545
+ }, NO_RESULT_TIMEOUT_MS);
546
+ }
489
547
  recognition.onresult = (e) => {
490
548
  if (this.recognition !== recognition) return;
491
549
  this.receivedResult = true;
492
550
  this.clearNoResultTimer();
551
+ if (!this.active) return;
493
552
  let final_ = "";
494
553
  let interim = "";
495
554
  for (let i = e.resultIndex; i < e.results.length; i++) {
496
555
  const t = e.results[i][0].transcript;
497
556
  if (e.results[i].isFinal) {
498
- if (i > lastFinalIndex) {
557
+ if (i > this.lastFinalIndex) {
499
558
  final_ += t;
500
- lastFinalIndex = i;
559
+ this.lastFinalIndex = i;
501
560
  }
502
561
  } else {
503
562
  interim += t;
@@ -506,8 +565,8 @@ var SpeechStreamingManager = class {
506
565
  this.log(
507
566
  `[SSM] onresult \u2014 finals: "${final_}", interim: "${interim}", accumulated: "${this.accumulated}"`
508
567
  );
509
- if (final_ && final_.trim() !== lastFinalText) {
510
- lastFinalText = final_.trim();
568
+ if (final_ && final_.trim() !== this.lastFinalText) {
569
+ this.lastFinalText = final_.trim();
511
570
  this.accumulated = this.accumulated ? this.accumulated + " " + final_.trim() : final_.trim();
512
571
  this.onTranscript?.(this.accumulated);
513
572
  } else if (interim) {
@@ -519,11 +578,13 @@ var SpeechStreamingManager = class {
519
578
  recognition.onerror = (e) => {
520
579
  if (this.recognition !== recognition) return;
521
580
  this.log(`[SSM] onerror \u2014 ${e.error}`);
522
- this.onError?.(e.error);
581
+ if (this.active) this.onError?.(e.error);
523
582
  };
524
583
  recognition.onend = () => {
525
584
  if (this.recognition !== recognition) return;
526
- this.log(`[SSM] onend \u2014 active: ${this.active}, receivedResult: ${this.receivedResult}`);
585
+ this.log(
586
+ `[SSM] onend \u2014 active: ${this.active}, keepingWarm: ${this.keepingWarm}, receivedResult: ${this.receivedResult}`
587
+ );
527
588
  if (this.active) {
528
589
  this.onPause?.();
529
590
  try {
@@ -532,10 +593,22 @@ var SpeechStreamingManager = class {
532
593
  } catch (err) {
533
594
  this.log(`[SSM] restart THREW: ${err}`);
534
595
  this.recognition = null;
596
+ this.currentLang = "";
535
597
  this.onError?.("Speech recognition failed to restart after pause.");
536
598
  }
599
+ } else if (this.keepingWarm) {
600
+ try {
601
+ recognition.start();
602
+ this.log("[SSM] warm restart");
603
+ } catch (err) {
604
+ this.log(`[SSM] warm restart THREW: ${err}`);
605
+ this.recognition = null;
606
+ this.keepingWarm = false;
607
+ this.currentLang = "";
608
+ }
537
609
  } else {
538
610
  this.recognition = null;
611
+ this.currentLang = "";
539
612
  }
540
613
  };
541
614
  this.recognition = recognition;
@@ -545,15 +618,20 @@ var SpeechStreamingManager = class {
545
618
  } catch (err) {
546
619
  this.log(`[SSM] recognition.start() THREW: ${err}`);
547
620
  this.recognition = null;
621
+ this.currentLang = "";
622
+ const wasActive = this.active;
548
623
  this.active = false;
624
+ this.keepingWarm = false;
549
625
  this.clearNoResultTimer();
550
- this.onError?.(
551
- `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
552
- );
626
+ if (wasActive) {
627
+ this.onError?.(
628
+ `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
629
+ );
630
+ }
553
631
  return Promise.resolve();
554
632
  }
555
- if (skipMicWait) {
556
- this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
633
+ if (!this.active || skipMicWait) {
634
+ if (skipMicWait) this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
557
635
  return Promise.resolve();
558
636
  }
559
637
  return micClaimPromise;
@@ -564,26 +642,50 @@ var SpeechStreamingManager = class {
564
642
  this.noResultTimer = null;
565
643
  }
566
644
  }
567
- /** Stop streaming recognition and return accumulated text. */
645
+ clearIdleTimer() {
646
+ if (this.idleTimer) {
647
+ clearTimeout(this.idleTimer);
648
+ this.idleTimer = null;
649
+ }
650
+ }
651
+ startIdleTimer() {
652
+ this.clearIdleTimer();
653
+ this.idleTimer = setTimeout(() => {
654
+ this.idleTimer = null;
655
+ this.keepingWarm = false;
656
+ this.log("[SSM] idle timeout \u2014 stopping recognition");
657
+ if (this.recognition) {
658
+ const rec = this.recognition;
659
+ this.recognition = null;
660
+ this.currentLang = "";
661
+ try {
662
+ rec.stop();
663
+ } catch {
664
+ }
665
+ }
666
+ }, IDLE_TIMEOUT_MS);
667
+ }
668
+ /** Stop streaming recognition and return accumulated text.
669
+ * Keeps the recognition session alive (muted) for instant restart. */
568
670
  stop() {
569
671
  this.active = false;
672
+ this.keepingWarm = true;
570
673
  this.clearNoResultTimer();
571
- if (this.recognition) {
572
- const rec = this.recognition;
573
- this.recognition = null;
574
- rec.stop();
575
- }
576
674
  const result = this.accumulated;
577
675
  this.accumulated = "";
676
+ this.startIdleTimer();
578
677
  return result;
579
678
  }
580
- /** Abort immediately without returning text. */
679
+ /** Abort immediately and release all resources. */
581
680
  destroy() {
582
681
  this.active = false;
682
+ this.keepingWarm = false;
583
683
  this.clearNoResultTimer();
684
+ this.clearIdleTimer();
584
685
  if (this.recognition) {
585
686
  const rec = this.recognition;
586
687
  this.recognition = null;
688
+ this.currentLang = "";
587
689
  rec.abort();
588
690
  }
589
691
  this.accumulated = "";
@@ -638,6 +740,9 @@ var STTEngine = class extends TypedEventEmitter {
638
740
  await this.workerManager.loadModel(this.config);
639
741
  this.state.isModelLoaded = true;
640
742
  this.updateStatus("ready");
743
+ if (this.config.streaming.enabled) {
744
+ this.speechStreaming.preWarm(this.config.language);
745
+ }
641
746
  } catch (err) {
642
747
  this.emitError("MODEL_LOAD_FAILED", err instanceof Error ? err.message : String(err));
643
748
  this.updateStatus("idle");