@tekyzinc/stt-component 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +150 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +24 -10
- package/dist/index.d.ts +24 -10
- package/dist/index.js +150 -37
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
|
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
|
|
|
218
218
|
private recognition;
|
|
219
219
|
private accumulated;
|
|
220
220
|
private active;
|
|
221
|
+
private keepingWarm;
|
|
222
|
+
private currentLang;
|
|
221
223
|
private receivedResult;
|
|
224
|
+
private lastFinalIndex;
|
|
225
|
+
private lastFinalText;
|
|
222
226
|
private noResultTimer;
|
|
227
|
+
private idleTimer;
|
|
223
228
|
private onTranscript;
|
|
224
229
|
private onPause;
|
|
225
230
|
private onError;
|
|
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
|
|
|
236
241
|
setOnDebug(fn: (message: string) => void): void;
|
|
237
242
|
private log;
|
|
238
243
|
/**
|
|
239
|
-
*
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
*
|
|
246
|
-
*
|
|
244
|
+
* Pre-warm: start recognition in muted mode so it's ready before the user
|
|
245
|
+
* clicks. Call after engine.init() completes. Eliminates startup latency on
|
|
246
|
+
* first click by keeping the Google Speech session alive.
|
|
247
|
+
*/
|
|
248
|
+
preWarm(language: string): void;
|
|
249
|
+
/**
|
|
250
|
+
* Start streaming recognition. If recognition is already warm (session
|
|
251
|
+
* running from preWarm or a previous session within the idle window),
|
|
252
|
+
* activates instantly — no Google handshake. Otherwise cold-starts.
|
|
247
253
|
*/
|
|
248
254
|
start(language: string, skipMicWait?: boolean): Promise<void>;
|
|
255
|
+
/**
|
|
256
|
+
* Create and start a new SpeechRecognition instance.
|
|
257
|
+
* Used by both preWarm() (active=false) and start() cold path (active=true).
|
|
258
|
+
*/
|
|
259
|
+
private spawnRecognition;
|
|
249
260
|
private clearNoResultTimer;
|
|
250
|
-
|
|
261
|
+
private clearIdleTimer;
|
|
262
|
+
private startIdleTimer;
|
|
263
|
+
/** Stop streaming recognition and return accumulated text.
|
|
264
|
+
* Keeps the recognition session alive (muted) for instant restart. */
|
|
251
265
|
stop(): string;
|
|
252
|
-
/** Abort immediately
|
|
266
|
+
/** Abort immediately and release all resources. */
|
|
253
267
|
destroy(): void;
|
|
254
268
|
}
|
|
255
269
|
|
package/dist/index.d.ts
CHANGED
|
@@ -218,8 +218,13 @@ declare class SpeechStreamingManager {
|
|
|
218
218
|
private recognition;
|
|
219
219
|
private accumulated;
|
|
220
220
|
private active;
|
|
221
|
+
private keepingWarm;
|
|
222
|
+
private currentLang;
|
|
221
223
|
private receivedResult;
|
|
224
|
+
private lastFinalIndex;
|
|
225
|
+
private lastFinalText;
|
|
222
226
|
private noResultTimer;
|
|
227
|
+
private idleTimer;
|
|
223
228
|
private onTranscript;
|
|
224
229
|
private onPause;
|
|
225
230
|
private onError;
|
|
@@ -236,20 +241,29 @@ declare class SpeechStreamingManager {
|
|
|
236
241
|
setOnDebug(fn: (message: string) => void): void;
|
|
237
242
|
private log;
|
|
238
243
|
/**
|
|
239
|
-
*
|
|
240
|
-
*
|
|
241
|
-
*
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
*
|
|
246
|
-
*
|
|
244
|
+
* Pre-warm: start recognition in muted mode so it's ready before the user
|
|
245
|
+
* clicks. Call after engine.init() completes. Eliminates startup latency on
|
|
246
|
+
* first click by keeping the Google Speech session alive.
|
|
247
|
+
*/
|
|
248
|
+
preWarm(language: string): void;
|
|
249
|
+
/**
|
|
250
|
+
* Start streaming recognition. If recognition is already warm (session
|
|
251
|
+
* running from preWarm or a previous session within the idle window),
|
|
252
|
+
* activates instantly — no Google handshake. Otherwise cold-starts.
|
|
247
253
|
*/
|
|
248
254
|
start(language: string, skipMicWait?: boolean): Promise<void>;
|
|
255
|
+
/**
|
|
256
|
+
* Create and start a new SpeechRecognition instance.
|
|
257
|
+
* Used by both preWarm() (active=false) and start() cold path (active=true).
|
|
258
|
+
*/
|
|
259
|
+
private spawnRecognition;
|
|
249
260
|
private clearNoResultTimer;
|
|
250
|
-
|
|
261
|
+
private clearIdleTimer;
|
|
262
|
+
private startIdleTimer;
|
|
263
|
+
/** Stop streaming recognition and return accumulated text.
|
|
264
|
+
* Keeps the recognition session alive (muted) for instant restart. */
|
|
251
265
|
stop(): string;
|
|
252
|
-
/** Abort immediately
|
|
266
|
+
/** Abort immediately and release all resources. */
|
|
253
267
|
destroy(): void;
|
|
254
268
|
}
|
|
255
269
|
|
package/dist/index.js
CHANGED
|
@@ -114,6 +114,13 @@ async function resumeCapture(capture) {
|
|
|
114
114
|
function snapshotAudio(capture) {
|
|
115
115
|
return [...capture.samples];
|
|
116
116
|
}
|
|
117
|
+
function trimAudioBuffer(capture, keepSeconds) {
|
|
118
|
+
const samplesPerChunk = 4096;
|
|
119
|
+
const chunksToKeep = Math.ceil(keepSeconds * capture.audioCtx.sampleRate / samplesPerChunk);
|
|
120
|
+
if (capture.samples.length > chunksToKeep) {
|
|
121
|
+
capture.samples.splice(0, capture.samples.length - chunksToKeep);
|
|
122
|
+
}
|
|
123
|
+
}
|
|
117
124
|
async function resampleAudio(samples, nativeSr) {
|
|
118
125
|
const totalLength = samples.reduce((sum, s) => sum + s.length, 0);
|
|
119
126
|
if (totalLength === 0) return new Float32Array(0);
|
|
@@ -398,12 +405,22 @@ function toBCP47(language) {
|
|
|
398
405
|
return WHISPER_TO_BCP47[language.toLowerCase()] ?? language;
|
|
399
406
|
}
|
|
400
407
|
var NO_RESULT_TIMEOUT_MS = 5e3;
|
|
408
|
+
var IDLE_TIMEOUT_MS = 3e4;
|
|
401
409
|
var SpeechStreamingManager = class {
|
|
402
410
|
recognition = null;
|
|
403
411
|
accumulated = "";
|
|
404
412
|
active = false;
|
|
413
|
+
// user is recording — emit results, trigger onPause on end
|
|
414
|
+
keepingWarm = false;
|
|
415
|
+
// recognition running silently between user sessions
|
|
416
|
+
currentLang = "";
|
|
417
|
+
// BCP-47 of running recognition (for fast-path check)
|
|
405
418
|
receivedResult = false;
|
|
419
|
+
lastFinalIndex = -1;
|
|
420
|
+
// instance fields so warm-restart resets them cleanly
|
|
421
|
+
lastFinalText = "";
|
|
406
422
|
noResultTimer = null;
|
|
423
|
+
idleTimer = null;
|
|
407
424
|
onTranscript = null;
|
|
408
425
|
onPause = null;
|
|
409
426
|
onError = null;
|
|
@@ -433,14 +450,25 @@ var SpeechStreamingManager = class {
|
|
|
433
450
|
console.warn(message);
|
|
434
451
|
}
|
|
435
452
|
/**
|
|
436
|
-
*
|
|
437
|
-
*
|
|
438
|
-
*
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
453
|
+
* Pre-warm: start recognition in muted mode so it's ready before the user
|
|
454
|
+
* clicks. Call after engine.init() completes. Eliminates startup latency on
|
|
455
|
+
* first click by keeping the Google Speech session alive.
|
|
456
|
+
*/
|
|
457
|
+
preWarm(language) {
|
|
458
|
+
const SR = getSpeechRecognition();
|
|
459
|
+
if (!SR) return;
|
|
460
|
+
const bcp47 = toBCP47(language);
|
|
461
|
+
if (this.recognition && this.currentLang === bcp47) return;
|
|
462
|
+
this.log(`[SSM] preWarm() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
|
|
463
|
+
this.keepingWarm = true;
|
|
464
|
+
this.active = false;
|
|
465
|
+
this.clearIdleTimer();
|
|
466
|
+
this.spawnRecognition(language);
|
|
467
|
+
}
|
|
468
|
+
/**
|
|
469
|
+
* Start streaming recognition. If recognition is already warm (session
|
|
470
|
+
* running from preWarm or a previous session within the idle window),
|
|
471
|
+
* activates instantly — no Google handshake. Otherwise cold-starts.
|
|
444
472
|
*/
|
|
445
473
|
start(language, skipMicWait = false) {
|
|
446
474
|
const SR = getSpeechRecognition();
|
|
@@ -451,14 +479,49 @@ var SpeechStreamingManager = class {
|
|
|
451
479
|
const bcp47 = toBCP47(language);
|
|
452
480
|
this.log(`[SSM] start() \u2014 lang: "${language}" \u2192 "${bcp47}"`);
|
|
453
481
|
this.accumulated = "";
|
|
454
|
-
this.active = true;
|
|
455
482
|
this.receivedResult = false;
|
|
483
|
+
this.lastFinalIndex = -1;
|
|
484
|
+
this.lastFinalText = "";
|
|
485
|
+
this.clearIdleTimer();
|
|
486
|
+
if (this.recognition && this.currentLang === bcp47) {
|
|
487
|
+
this.log("[SSM] start() \u2014 warm session, activating immediately");
|
|
488
|
+
this.keepingWarm = false;
|
|
489
|
+
this.active = true;
|
|
490
|
+
this.clearNoResultTimer();
|
|
491
|
+
this.noResultTimer = setTimeout(() => {
|
|
492
|
+
if (this.active && !this.receivedResult) {
|
|
493
|
+
this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
|
|
494
|
+
this.onError?.(
|
|
495
|
+
"Speech streaming started but received no results. Mic may be blocked by another audio capture."
|
|
496
|
+
);
|
|
497
|
+
}
|
|
498
|
+
}, NO_RESULT_TIMEOUT_MS);
|
|
499
|
+
return Promise.resolve();
|
|
500
|
+
}
|
|
501
|
+
this.keepingWarm = false;
|
|
502
|
+
this.active = true;
|
|
503
|
+
return this.spawnRecognition(language, skipMicWait);
|
|
504
|
+
}
|
|
505
|
+
/**
|
|
506
|
+
* Create and start a new SpeechRecognition instance.
|
|
507
|
+
* Used by both preWarm() (active=false) and start() cold path (active=true).
|
|
508
|
+
*/
|
|
509
|
+
spawnRecognition(language, skipMicWait = false) {
|
|
510
|
+
const SR = getSpeechRecognition();
|
|
511
|
+
const bcp47 = toBCP47(language);
|
|
512
|
+
if (this.recognition) {
|
|
513
|
+
const old = this.recognition;
|
|
514
|
+
this.recognition = null;
|
|
515
|
+
try {
|
|
516
|
+
old.stop();
|
|
517
|
+
} catch {
|
|
518
|
+
}
|
|
519
|
+
}
|
|
520
|
+
this.currentLang = bcp47;
|
|
456
521
|
const recognition = new SR();
|
|
457
522
|
recognition.continuous = true;
|
|
458
523
|
recognition.interimResults = true;
|
|
459
524
|
recognition.lang = bcp47;
|
|
460
|
-
let lastFinalIndex = -1;
|
|
461
|
-
let lastFinalText = "";
|
|
462
525
|
let micReady = false;
|
|
463
526
|
const micClaimPromise = new Promise((resolve) => {
|
|
464
527
|
recognition.onaudiostart = () => {
|
|
@@ -478,26 +541,29 @@ var SpeechStreamingManager = class {
|
|
|
478
541
|
}, 300);
|
|
479
542
|
});
|
|
480
543
|
this.clearNoResultTimer();
|
|
481
|
-
this.
|
|
482
|
-
|
|
483
|
-
this.
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
544
|
+
if (this.active) {
|
|
545
|
+
this.noResultTimer = setTimeout(() => {
|
|
546
|
+
if (this.active && !this.receivedResult) {
|
|
547
|
+
this.log("[SSM] no-result timeout fired \u2014 no onresult in 5s");
|
|
548
|
+
this.onError?.(
|
|
549
|
+
"Speech streaming started but received no results. Mic may be blocked by another audio capture."
|
|
550
|
+
);
|
|
551
|
+
}
|
|
552
|
+
}, NO_RESULT_TIMEOUT_MS);
|
|
553
|
+
}
|
|
489
554
|
recognition.onresult = (e) => {
|
|
490
555
|
if (this.recognition !== recognition) return;
|
|
491
556
|
this.receivedResult = true;
|
|
492
557
|
this.clearNoResultTimer();
|
|
558
|
+
if (!this.active) return;
|
|
493
559
|
let final_ = "";
|
|
494
560
|
let interim = "";
|
|
495
561
|
for (let i = e.resultIndex; i < e.results.length; i++) {
|
|
496
562
|
const t = e.results[i][0].transcript;
|
|
497
563
|
if (e.results[i].isFinal) {
|
|
498
|
-
if (i > lastFinalIndex) {
|
|
564
|
+
if (i > this.lastFinalIndex) {
|
|
499
565
|
final_ += t;
|
|
500
|
-
lastFinalIndex = i;
|
|
566
|
+
this.lastFinalIndex = i;
|
|
501
567
|
}
|
|
502
568
|
} else {
|
|
503
569
|
interim += t;
|
|
@@ -506,8 +572,8 @@ var SpeechStreamingManager = class {
|
|
|
506
572
|
this.log(
|
|
507
573
|
`[SSM] onresult \u2014 finals: "${final_}", interim: "${interim}", accumulated: "${this.accumulated}"`
|
|
508
574
|
);
|
|
509
|
-
if (final_ && final_.trim() !== lastFinalText) {
|
|
510
|
-
lastFinalText = final_.trim();
|
|
575
|
+
if (final_ && final_.trim() !== this.lastFinalText) {
|
|
576
|
+
this.lastFinalText = final_.trim();
|
|
511
577
|
this.accumulated = this.accumulated ? this.accumulated + " " + final_.trim() : final_.trim();
|
|
512
578
|
this.onTranscript?.(this.accumulated);
|
|
513
579
|
} else if (interim) {
|
|
@@ -519,11 +585,13 @@ var SpeechStreamingManager = class {
|
|
|
519
585
|
recognition.onerror = (e) => {
|
|
520
586
|
if (this.recognition !== recognition) return;
|
|
521
587
|
this.log(`[SSM] onerror \u2014 ${e.error}`);
|
|
522
|
-
this.onError?.(e.error);
|
|
588
|
+
if (this.active) this.onError?.(e.error);
|
|
523
589
|
};
|
|
524
590
|
recognition.onend = () => {
|
|
525
591
|
if (this.recognition !== recognition) return;
|
|
526
|
-
this.log(
|
|
592
|
+
this.log(
|
|
593
|
+
`[SSM] onend \u2014 active: ${this.active}, keepingWarm: ${this.keepingWarm}, receivedResult: ${this.receivedResult}`
|
|
594
|
+
);
|
|
527
595
|
if (this.active) {
|
|
528
596
|
this.onPause?.();
|
|
529
597
|
try {
|
|
@@ -532,10 +600,22 @@ var SpeechStreamingManager = class {
|
|
|
532
600
|
} catch (err) {
|
|
533
601
|
this.log(`[SSM] restart THREW: ${err}`);
|
|
534
602
|
this.recognition = null;
|
|
603
|
+
this.currentLang = "";
|
|
535
604
|
this.onError?.("Speech recognition failed to restart after pause.");
|
|
536
605
|
}
|
|
606
|
+
} else if (this.keepingWarm) {
|
|
607
|
+
try {
|
|
608
|
+
recognition.start();
|
|
609
|
+
this.log("[SSM] warm restart");
|
|
610
|
+
} catch (err) {
|
|
611
|
+
this.log(`[SSM] warm restart THREW: ${err}`);
|
|
612
|
+
this.recognition = null;
|
|
613
|
+
this.keepingWarm = false;
|
|
614
|
+
this.currentLang = "";
|
|
615
|
+
}
|
|
537
616
|
} else {
|
|
538
617
|
this.recognition = null;
|
|
618
|
+
this.currentLang = "";
|
|
539
619
|
}
|
|
540
620
|
};
|
|
541
621
|
this.recognition = recognition;
|
|
@@ -545,15 +625,20 @@ var SpeechStreamingManager = class {
|
|
|
545
625
|
} catch (err) {
|
|
546
626
|
this.log(`[SSM] recognition.start() THREW: ${err}`);
|
|
547
627
|
this.recognition = null;
|
|
628
|
+
this.currentLang = "";
|
|
629
|
+
const wasActive = this.active;
|
|
548
630
|
this.active = false;
|
|
631
|
+
this.keepingWarm = false;
|
|
549
632
|
this.clearNoResultTimer();
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
633
|
+
if (wasActive) {
|
|
634
|
+
this.onError?.(
|
|
635
|
+
`Speech recognition failed to start: ${err instanceof Error ? err.message : String(err)}`
|
|
636
|
+
);
|
|
637
|
+
}
|
|
553
638
|
return Promise.resolve();
|
|
554
639
|
}
|
|
555
|
-
if (skipMicWait) {
|
|
556
|
-
this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
|
|
640
|
+
if (!this.active || skipMicWait) {
|
|
641
|
+
if (skipMicWait) this.log("[SSM] skipMicWait \u2014 warm restart, returning immediately");
|
|
557
642
|
return Promise.resolve();
|
|
558
643
|
}
|
|
559
644
|
return micClaimPromise;
|
|
@@ -564,26 +649,50 @@ var SpeechStreamingManager = class {
|
|
|
564
649
|
this.noResultTimer = null;
|
|
565
650
|
}
|
|
566
651
|
}
|
|
567
|
-
|
|
652
|
+
clearIdleTimer() {
|
|
653
|
+
if (this.idleTimer) {
|
|
654
|
+
clearTimeout(this.idleTimer);
|
|
655
|
+
this.idleTimer = null;
|
|
656
|
+
}
|
|
657
|
+
}
|
|
658
|
+
startIdleTimer() {
|
|
659
|
+
this.clearIdleTimer();
|
|
660
|
+
this.idleTimer = setTimeout(() => {
|
|
661
|
+
this.idleTimer = null;
|
|
662
|
+
this.keepingWarm = false;
|
|
663
|
+
this.log("[SSM] idle timeout \u2014 stopping recognition");
|
|
664
|
+
if (this.recognition) {
|
|
665
|
+
const rec = this.recognition;
|
|
666
|
+
this.recognition = null;
|
|
667
|
+
this.currentLang = "";
|
|
668
|
+
try {
|
|
669
|
+
rec.stop();
|
|
670
|
+
} catch {
|
|
671
|
+
}
|
|
672
|
+
}
|
|
673
|
+
}, IDLE_TIMEOUT_MS);
|
|
674
|
+
}
|
|
675
|
+
/** Stop streaming recognition and return accumulated text.
|
|
676
|
+
* Keeps the recognition session alive (muted) for instant restart. */
|
|
568
677
|
stop() {
|
|
569
678
|
this.active = false;
|
|
679
|
+
this.keepingWarm = true;
|
|
570
680
|
this.clearNoResultTimer();
|
|
571
|
-
if (this.recognition) {
|
|
572
|
-
const rec = this.recognition;
|
|
573
|
-
this.recognition = null;
|
|
574
|
-
rec.stop();
|
|
575
|
-
}
|
|
576
681
|
const result = this.accumulated;
|
|
577
682
|
this.accumulated = "";
|
|
683
|
+
this.startIdleTimer();
|
|
578
684
|
return result;
|
|
579
685
|
}
|
|
580
|
-
/** Abort immediately
|
|
686
|
+
/** Abort immediately and release all resources. */
|
|
581
687
|
destroy() {
|
|
582
688
|
this.active = false;
|
|
689
|
+
this.keepingWarm = false;
|
|
583
690
|
this.clearNoResultTimer();
|
|
691
|
+
this.clearIdleTimer();
|
|
584
692
|
if (this.recognition) {
|
|
585
693
|
const rec = this.recognition;
|
|
586
694
|
this.recognition = null;
|
|
695
|
+
this.currentLang = "";
|
|
587
696
|
rec.abort();
|
|
588
697
|
}
|
|
589
698
|
this.accumulated = "";
|
|
@@ -638,6 +747,9 @@ var STTEngine = class extends TypedEventEmitter {
|
|
|
638
747
|
await this.workerManager.loadModel(this.config);
|
|
639
748
|
this.state.isModelLoaded = true;
|
|
640
749
|
this.updateStatus("ready");
|
|
750
|
+
if (this.config.streaming.enabled) {
|
|
751
|
+
this.speechStreaming.preWarm(this.config.language);
|
|
752
|
+
}
|
|
641
753
|
} catch (err) {
|
|
642
754
|
this.emitError("MODEL_LOAD_FAILED", err instanceof Error ? err.message : String(err));
|
|
643
755
|
this.updateStatus("idle");
|
|
@@ -774,6 +886,7 @@ var STTEngine = class extends TypedEventEmitter {
|
|
|
774
886
|
const text = await this.workerManager.transcribe(audio);
|
|
775
887
|
if (text.trim() && this.capture && !this._stopping) {
|
|
776
888
|
this.emit("correction", text);
|
|
889
|
+
trimAudioBuffer(this.capture, 30);
|
|
777
890
|
}
|
|
778
891
|
} catch (err) {
|
|
779
892
|
this.emitError(
|