@tekyzinc/stt-component 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -188,12 +188,9 @@ declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
188
188
  private handleMessage;
189
189
  }
190
190
 
191
- /**
192
- * Manages mid-recording correction timing.
193
- * Two triggers: pause detection and forced interval.
194
- */
195
191
  declare class CorrectionOrchestrator {
196
192
  private forcedTimer;
193
+ private initialTimer;
197
194
  private lastCorrectionTime;
198
195
  private correctionFn;
199
196
  private config;
@@ -201,7 +198,9 @@ declare class CorrectionOrchestrator {
201
198
  constructor(config: ResolvedSTTConfig['correction']);
202
199
  /** Set the function to call when a correction is triggered. */
203
200
  setCorrectionFn(fn: () => void): void;
204
- /** Start the correction orchestrator (begin forced interval timer). */
201
+ /** Start the correction orchestrator.
202
+ * Fires a quick initial correction after 1s for early feedback, then
203
+ * switches to the regular forcedInterval cadence from that point. */
205
204
  start(): void;
206
205
  /** Stop the orchestrator (clear all timers). */
207
206
  stop(): void;
@@ -219,8 +218,13 @@ declare class SpeechStreamingManager {
219
218
  private recognition;
220
219
  private accumulated;
221
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
222
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
223
226
  private noResultTimer;
227
+ private idleTimer;
224
228
  private onTranscript;
225
229
  private onPause;
226
230
  private onError;
@@ -237,20 +241,29 @@ declare class SpeechStreamingManager {
237
241
  setOnDebug(fn: (message: string) => void): void;
238
242
  private log;
239
243
  /**
240
- * Start streaming recognition. Returns a Promise that resolves once
241
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
242
- * a 300ms fallback whichever comes first. The engine should await
243
- * this before calling getUserMedia to avoid dual-mic conflicts.
244
- *
245
- * When skipMicWait is true (warm restart — mic already active), returns
246
- * immediately after calling recognition.start() without waiting for
247
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
248
253
  */
249
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
250
260
  private clearNoResultTimer;
251
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
252
265
  stop(): string;
253
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
254
267
  destroy(): void;
255
268
  }
256
269
 
package/dist/index.d.ts CHANGED
@@ -188,12 +188,9 @@ declare class WorkerManager extends TypedEventEmitter<WorkerManagerEvents> {
188
188
  private handleMessage;
189
189
  }
190
190
 
191
- /**
192
- * Manages mid-recording correction timing.
193
- * Two triggers: pause detection and forced interval.
194
- */
195
191
  declare class CorrectionOrchestrator {
196
192
  private forcedTimer;
193
+ private initialTimer;
197
194
  private lastCorrectionTime;
198
195
  private correctionFn;
199
196
  private config;
@@ -201,7 +198,9 @@ declare class CorrectionOrchestrator {
201
198
  constructor(config: ResolvedSTTConfig['correction']);
202
199
  /** Set the function to call when a correction is triggered. */
203
200
  setCorrectionFn(fn: () => void): void;
204
- /** Start the correction orchestrator (begin forced interval timer). */
201
+ /** Start the correction orchestrator.
202
+ * Fires a quick initial correction after 1s for early feedback, then
203
+ * switches to the regular forcedInterval cadence from that point. */
205
204
  start(): void;
206
205
  /** Stop the orchestrator (clear all timers). */
207
206
  stop(): void;
@@ -219,8 +218,13 @@ declare class SpeechStreamingManager {
219
218
  private recognition;
220
219
  private accumulated;
221
220
  private active;
221
+ private keepingWarm;
222
+ private currentLang;
222
223
  private receivedResult;
224
+ private lastFinalIndex;
225
+ private lastFinalText;
223
226
  private noResultTimer;
227
+ private idleTimer;
224
228
  private onTranscript;
225
229
  private onPause;
226
230
  private onError;
@@ -237,20 +241,29 @@ declare class SpeechStreamingManager {
237
241
  setOnDebug(fn: (message: string) => void): void;
238
242
  private log;
239
243
  /**
240
- * Start streaming recognition. Returns a Promise that resolves once
241
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
242
- * a 300ms fallback whichever comes first. The engine should await
243
- * this before calling getUserMedia to avoid dual-mic conflicts.
244
- *
245
- * When skipMicWait is true (warm restart — mic already active), returns
246
- * immediately after calling recognition.start() without waiting for
247
- * onaudiostart.
244
+ * Pre-warm: start recognition in muted mode so it's ready before the user
245
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
246
+ * first click by keeping the Google Speech session alive.
247
+ */
248
+ preWarm(language: string): void;
249
+ /**
250
+ * Start streaming recognition. If recognition is already warm (session
251
+ * running from preWarm or a previous session within the idle window),
252
+ * activates instantly — no Google handshake. Otherwise cold-starts.
248
253
  */
249
254
  start(language: string, skipMicWait?: boolean): Promise<void>;
255
+ /**
256
+ * Create and start a new SpeechRecognition instance.
257
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
258
+ */
259
+ private spawnRecognition;
250
260
  private clearNoResultTimer;
251
- /** Stop streaming recognition and return accumulated text. */
261
+ private clearIdleTimer;
262
+ private startIdleTimer;
263
+ /** Stop streaming recognition and return accumulated text.
264
+ * Keeps the recognition session alive (muted) for instant restart. */
252
265
  stop(): string;
253
- /** Abort immediately without returning text. */
266
+ /** Abort immediately and release all resources. */
254
267
  destroy(): void;
255
268
  }
256
269
 
package/dist/index.js CHANGED
@@ -256,8 +256,10 @@ var WorkerManager = class extends TypedEventEmitter {
256
256
  };
257
257
 
258
258
  // src/correction-orchestrator.ts
259
+ var INITIAL_CORRECTION_DELAY_MS = 1e3;
259
260
  var CorrectionOrchestrator = class {
260
261
  forcedTimer = null;
262
+ initialTimer = null;
261
263
  lastCorrectionTime = 0;
262
264
  correctionFn = null;
263
265
  config;
@@ -269,14 +271,25 @@ var CorrectionOrchestrator = class {
269
271
  setCorrectionFn(fn) {
270
272
  this.correctionFn = fn;
271
273
  }
272
- /** Start the correction orchestrator (begin forced interval timer). */
274
+ /** Start the correction orchestrator.
275
+ * Fires a quick initial correction after 1s for early feedback, then
276
+ * switches to the regular forcedInterval cadence from that point. */
273
277
  start() {
274
278
  if (!this.config.enabled) return;
275
279
  this.lastCorrectionTime = Date.now();
276
- this.startForcedTimer();
280
+ this.initialTimer = setTimeout(() => {
281
+ this.initialTimer = null;
282
+ this.correctionFn?.();
283
+ this.lastCorrectionTime = Date.now();
284
+ this.startForcedTimer();
285
+ }, INITIAL_CORRECTION_DELAY_MS);
277
286
  }
278
287
  /** Stop the orchestrator (clear all timers). */
279
288
  stop() {
289
+ if (this.initialTimer) {
290
+ clearTimeout(this.initialTimer);
291
+ this.initialTimer = null;
292
+ }
280
293
  this.stopForcedTimer();
281
294
  }
282
295
  /** Called when a speech pause is detected. Triggers correction if cooldown elapsed. */
@@ -385,12 +398,22 @@ function toBCP47(language) {
385
398
  return WHISPER_TO_BCP47[language.toLowerCase()] ?? language;
386
399
  }
387
400
  var NO_RESULT_TIMEOUT_MS = 5e3;
401
+ var IDLE_TIMEOUT_MS = 3e4;
388
402
  var SpeechStreamingManager = class {
389
403
  recognition = null;
390
404
  accumulated = "";
391
405
  active = false;
406
+ // user is recording — emit results, trigger onPause on end
407
+ keepingWarm = false;
408
+ // recognition running silently between user sessions
409
+ currentLang = "";
410
+ // BCP-47 of running recognition (for fast-path check)
392
411
  receivedResult = false;
412
+ lastFinalIndex = -1;
413
+ // instance fields so warm-restart resets them cleanly
414
+ lastFinalText = "";
393
415
  noResultTimer = null;
416
+ idleTimer = null;
394
417
  onTranscript = null;
395
418
  onPause = null;
396
419
  onError = null;
@@ -420,14 +443,25 @@ var SpeechStreamingManager = class {
420
443
  console.warn(message);
421
444
  }
422
445
  /**
423
- * Start streaming recognition. Returns a Promise that resolves once
424
- * SpeechRecognition has claimed the microphone (onaudiostart) or after
425
- * a 300ms fallback whichever comes first. The engine should await
426
- * this before calling getUserMedia to avoid dual-mic conflicts.
427
- *
428
- * When skipMicWait is true (warm restart — mic already active), returns
429
- * immediately after calling recognition.start() without waiting for
430
- * onaudiostart.
446
+ * Pre-warm: start recognition in muted mode so it's ready before the user
447
+ * clicks. Call after engine.init() completes. Eliminates startup latency on
448
+ * first click by keeping the Google Speech session alive.
449
+ */
450
+ preWarm(language) {
451
+ const SR = getSpeechRecognition();
452
+ if (!SR) return;
453
+ const bcp47 = toBCP47(language);
454
+ if (this.recognition && this.currentLang === bcp47) return;
455
+ this.log(`[SSM] preWarm() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
456
+ this.keepingWarm = true;
457
+ this.active = false;
458
+ this.clearIdleTimer();
459
+ this.spawnRecognition(language);
460
+ }
461
+ /**
462
+ * Start streaming recognition. If recognition is already warm (session
463
+ * running from preWarm or a previous session within the idle window),
464
+ * activates instantly — no Google handshake. Otherwise cold-starts.
431
465
  */
432
466
  start(language, skipMicWait = false) {
433
467
  const SR = getSpeechRecognition();
@@ -438,14 +472,49 @@ var SpeechStreamingManager = class {
438
472
  const bcp47 = toBCP47(language);
439
473
  this.log(`[SSM] start() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
440
474
  this.accumulated = "";
441
- this.active = true;
442
475
  this.receivedResult = false;
476
+ this.lastFinalIndex = -1;
477
+ this.lastFinalText = "";
478
+ this.clearIdleTimer();
479
+ if (this.recognition && this.currentLang === bcp47) {
480
+ this.log("[SSM] start() \u2014 warm session, activating immediately");
481
+ this.keepingWarm = false;
482
+ this.active = true;
483
+ this.clearNoResultTimer();
484
+ this.noResultTimer = setTimeout(() => {
485
+ if (this.active && !this.receivedResult) {
486
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
487
+ this.onError?.(
488
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
489
+ );
490
+ }
491
+ }, NO_RESULT_TIMEOUT_MS);
492
+ return Promise.resolve();
493
+ }
494
+ this.keepingWarm = false;
495
+ this.active = true;
496
+ return this.spawnRecognition(language, skipMicWait);
497
+ }
498
+ /**
499
+ * Create and start a new SpeechRecognition instance.
500
+ * Used by both preWarm() (active=false) and start() cold path (active=true).
501
+ */
502
+ spawnRecognition(language, skipMicWait = false) {
503
+ const SR = getSpeechRecognition();
504
+ const bcp47 = toBCP47(language);
505
+ if (this.recognition) {
506
+ const old = this.recognition;
507
+ this.recognition = null;
508
+ try {
509
+ old.stop();
510
+ } catch {
511
+ }
512
+ }
513
+ this.currentLang = bcp47;
443
514
  const recognition = new SR();
444
515
  recognition.continuous = true;
445
516
  recognition.interimResults = true;
446
517
  recognition.lang = bcp47;
447
- let lastFinalIndex = -1;
448
- let lastFinalText = "";
449
518
  let micReady = false;
450
519
  const micClaimPromise = new Promise((resolve) => {
451
520
  recognition.onaudiostart = () => {
@@ -465,26 +534,29 @@ var SpeechStreamingManager = class {
465
534
  }, 300);
466
535
  });
467
536
  this.clearNoResultTimer();
468
- this.noResultTimer = setTimeout(() => {
469
- if (this.active && !this.receivedResult) {
470
- this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
471
- this.onError?.(
472
- "Speech streaming started but received no results. Mic may be blocked by another audio capture."
473
- );
474
- }
475
- }, NO_RESULT_TIMEOUT_MS);
537
+ if (this.active) {
538
+ this.noResultTimer = setTimeout(() => {
539
+ if (this.active && !this.receivedResult) {
540
+ this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
541
+ this.onError?.(
542
+ "Speech streaming started but received no results. Mic may be blocked by another audio capture."
543
+ );
544
+ }
545
+ }, NO_RESULT_TIMEOUT_MS);
546
+ }
476
547
  recognition.onresult = (e) => {
477
548
  if (this.recognition !== recognition) return;
478
549
  this.receivedResult = true;
479
550
  this.clearNoResultTimer();
551
+ if (!this.active) return;
480
552
  let final_ = "";
481
553
  let interim = "";
482
554
  for (let i = e.resultIndex; i < e.results.length; i++) {
483
555
  const t = e.results[i][0].transcript;
484
556
  if (e.results[i].isFinal) {
485
- if (i > lastFinalIndex) {
557
+ if (i > this.lastFinalIndex) {
486
558
  final_ += t;
487
- lastFinalIndex = i;
559
+ this.lastFinalIndex = i;
488
560
  }
489
561
  } else {
490
562
  interim += t;
@@ -493,8 +565,8 @@ var SpeechStreamingManager = class {
493
565
  this.log(
494
566
  `[SSM] onresult \u2014 finals: "${final_}", interim: "${interim}", accumulated: "${this.accumulated}"`
495
567
  );
496
- if (final_ && final_.trim() !== lastFinalText) {
497
- lastFinalText = final_.trim();
568
+ if (final_ && final_.trim() !== this.lastFinalText) {
569
+ this.lastFinalText = final_.trim();
498
570
  this.accumulated = this.accumulated ? this.accumulated + " " + final_.trim() : final_.trim();
499
571
  this.onTranscript?.(this.accumulated);
500
572
  } else if (interim) {
@@ -506,11 +578,13 @@ var SpeechStreamingManager = class {
506
578
  recognition.onerror = (e) => {
507
579
  if (this.recognition !== recognition) return;
508
580
  this.log(`[SSM] onerror \u2014 ${e.error}`);
509
- this.onError?.(e.error);
581
+ if (this.active) this.onError?.(e.error);
510
582
  };
511
583
  recognition.onend = () => {
512
584
  if (this.recognition !== recognition) return;
513
- this.log(`[SSM] onend \u2014 active: ${this.active}, receivedResult: ${this.receivedResult}`);
585
+ this.log(
586
+ `[SSM] onend \u2014 active: ${this.active}, keepingWarm: ${this.keepingWarm}, receivedResult: ${this.receivedResult}`
587
+ );
514
588
  if (this.active) {
515
589
  this.onPause?.();
516
590
  try {
@@ -519,10 +593,22 @@ var SpeechStreamingManager = class {
519
593
  } catch (err) {
520
594
  this.log(`[SSM] restart THREW: ${err}`);
521
595
  this.recognition = null;
596
+ this.currentLang = "";
522
597
  this.onError?.("Speech recognition failed to restart after pause.");
523
598
  }
599
+ } else if (this.keepingWarm) {
600
+ try {
601
+ recognition.start();
602
+ this.log("[SSM] warm restart");
603
+ } catch (err) {
604
+ this.log(`[SSM] warm restart THREW: ${err}`);
605
+ this.recognition = null;
606
+ this.keepingWarm = false;
607
+ this.currentLang = "";
608
+ }
524
609
  } else {
525
610
  this.recognition = null;
611
+ this.currentLang = "";
526
612
  }
527
613
  };
528
614
  this.recognition = recognition;
@@ -532,15 +618,20 @@ var SpeechStreamingManager = class {
532
618
  } catch (err) {
533
619
  this.log(`[SSM] recognition.start() THREW: ${err}`);
534
620
  this.recognition = null;
621
+ this.currentLang = "";
622
+ const wasActive = this.active;
535
623
  this.active = false;
624
+ this.keepingWarm = false;
536
625
  this.clearNoResultTimer();
537
- this.onError?.(
538
- `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
539
- );
626
+ if (wasActive) {
627
+ this.onError?.(
628
+ `Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
629
+ );
630
+ }
540
631
  return Promise.resolve();
541
632
  }
542
- if (skipMicWait) {
543
- this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
633
+ if (!this.active || skipMicWait) {
634
+ if (skipMicWait) this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
544
635
  return Promise.resolve();
545
636
  }
546
637
  return micClaimPromise;
@@ -551,26 +642,50 @@ var SpeechStreamingManager = class {
551
642
  this.noResultTimer = null;
552
643
  }
553
644
  }
554
- /** Stop streaming recognition and return accumulated text. */
645
+ clearIdleTimer() {
646
+ if (this.idleTimer) {
647
+ clearTimeout(this.idleTimer);
648
+ this.idleTimer = null;
649
+ }
650
+ }
651
+ startIdleTimer() {
652
+ this.clearIdleTimer();
653
+ this.idleTimer = setTimeout(() => {
654
+ this.idleTimer = null;
655
+ this.keepingWarm = false;
656
+ this.log("[SSM] idle timeout \u2014 stopping recognition");
657
+ if (this.recognition) {
658
+ const rec = this.recognition;
659
+ this.recognition = null;
660
+ this.currentLang = "";
661
+ try {
662
+ rec.stop();
663
+ } catch {
664
+ }
665
+ }
666
+ }, IDLE_TIMEOUT_MS);
667
+ }
668
+ /** Stop streaming recognition and return accumulated text.
669
+ * Keeps the recognition session alive (muted) for instant restart. */
555
670
  stop() {
556
671
  this.active = false;
672
+ this.keepingWarm = true;
557
673
  this.clearNoResultTimer();
558
- if (this.recognition) {
559
- const rec = this.recognition;
560
- this.recognition = null;
561
- rec.stop();
562
- }
563
674
  const result = this.accumulated;
564
675
  this.accumulated = "";
676
+ this.startIdleTimer();
565
677
  return result;
566
678
  }
567
- /** Abort immediately without returning text. */
679
+ /** Abort immediately and release all resources. */
568
680
  destroy() {
569
681
  this.active = false;
682
+ this.keepingWarm = false;
570
683
  this.clearNoResultTimer();
684
+ this.clearIdleTimer();
571
685
  if (this.recognition) {
572
686
  const rec = this.recognition;
573
687
  this.recognition = null;
688
+ this.currentLang = "";
574
689
  rec.abort();
575
690
  }
576
691
  this.accumulated = "";
@@ -625,6 +740,9 @@ var STTEngine = class extends TypedEventEmitter {
625
740
  await this.workerManager.loadModel(this.config);
626
741
  this.state.isModelLoaded = true;
627
742
  this.updateStatus("ready");
743
+ if (this.config.streaming.enabled) {
744
+ this.speechStreaming.preWarm(this.config.language);
745
+ }
628
746
  } catch (err) {
629
747
  this.emitError("MODEL_LOAD_FAILED", err instanceof Error ? err.message : String(err));
630
748
  this.updateStatus("idle");