speech-to-speech 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +218 -6
- package/dist/index-D3jSI59z.d.cts +266 -0
- package/dist/index-DArRlgIf.d.ts +266 -0
- package/dist/index.cjs +256 -35
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +116 -2
- package/dist/index.d.ts +116 -2
- package/dist/index.mjs +250 -16
- package/dist/index.mjs.map +1 -1
- package/dist/stt/index.cjs +104 -35
- package/dist/stt/index.cjs.map +1 -1
- package/dist/stt/index.d.cts +2 -241
- package/dist/stt/index.d.ts +2 -241
- package/dist/stt/index.mjs +104 -16
- package/dist/stt/index.mjs.map +1 -1
- package/dist/tts/index.cjs +168 -35
- package/dist/tts/index.cjs.map +1 -1
- package/dist/tts/index.d.cts +100 -1
- package/dist/tts/index.d.ts +100 -1
- package/dist/tts/index.mjs +164 -16
- package/dist/tts/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -6,6 +6,7 @@ TypeScript utilities for speech-to-text (STT) and text-to-speech (TTS) in the br
|
|
|
6
6
|
|
|
7
7
|
- 🎤 **STT**: Browser-native speech recognition with session management
|
|
8
8
|
- 🔊 **TTS**: Piper neural TTS with automatic model downloading
|
|
9
|
+
- ⚡ **WASM Caching**: Automatic browser caching eliminates repeated downloads
|
|
9
10
|
- 🎵 **Shared Audio Queue**: Auto-play audio queue for seamless playback
|
|
10
11
|
- ✅ **Zero Config**: No manual ONNX setup required - everything is handled automatically
|
|
11
12
|
- 📦 **Small**: ~135KB package size
|
|
@@ -37,7 +38,7 @@ stt.start();
|
|
|
37
38
|
|
|
38
39
|
// Text-to-Speech with auto-play queue
|
|
39
40
|
const tts = new TTSLogic({ voiceId: "en_US-hfc_female-medium" });
|
|
40
|
-
await tts.initialize();
|
|
41
|
+
await tts.initialize(); // WASM files cached automatically
|
|
41
42
|
|
|
42
43
|
const result = await tts.synthesize("Hello world!");
|
|
43
44
|
sharedAudioPlayer.addAudioIntoQueue(result.audio, result.sampleRate);
|
|
@@ -255,24 +256,41 @@ export default function SpeechComponent() {
|
|
|
255
256
|
## Exports
|
|
256
257
|
|
|
257
258
|
```typescript
|
|
258
|
-
// Main bundle (STT + TTS)
|
|
259
|
+
// Main bundle (STT + TTS + Service wrapper)
|
|
259
260
|
import {
|
|
261
|
+
// Service wrapper (new in 0.1.4)
|
|
262
|
+
createSpeechService,
|
|
263
|
+
// STT
|
|
260
264
|
STTLogic,
|
|
265
|
+
getCompatibilityInfo,
|
|
266
|
+
// TTS
|
|
261
267
|
TTSLogic,
|
|
268
|
+
prefetchTTSModel,
|
|
269
|
+
cleanTextForTTS,
|
|
262
270
|
AudioPlayer,
|
|
263
271
|
createAudioPlayer,
|
|
264
272
|
sharedAudioPlayer,
|
|
265
273
|
} from "speech-to-speech";
|
|
266
274
|
|
|
267
275
|
// STT only
|
|
268
|
-
import {
|
|
276
|
+
import {
|
|
277
|
+
STTLogic,
|
|
278
|
+
ResetSTTLogic,
|
|
279
|
+
VADController,
|
|
280
|
+
getCompatibilityInfo, // new in 0.1.4
|
|
281
|
+
} from "speech-to-speech/stt";
|
|
269
282
|
|
|
270
283
|
// TTS only
|
|
271
284
|
import {
|
|
272
285
|
TTSLogic,
|
|
286
|
+
prefetchTTSModel, // new in 0.1.4
|
|
287
|
+
cleanTextForTTS, // new in 0.1.4
|
|
273
288
|
AudioPlayer,
|
|
274
289
|
createAudioPlayer,
|
|
275
290
|
sharedAudioPlayer,
|
|
291
|
+
ensureWasmCached,
|
|
292
|
+
isWasmCached,
|
|
293
|
+
clearWasmCache,
|
|
276
294
|
} from "speech-to-speech/tts";
|
|
277
295
|
```
|
|
278
296
|
|
|
@@ -323,7 +341,8 @@ Piper TTS synthesizer. Voice models download automatically on first use.
|
|
|
323
341
|
```typescript
|
|
324
342
|
const tts = new TTSLogic({
|
|
325
343
|
voiceId: "en_US-hfc_female-medium", // Piper voice ID
|
|
326
|
-
warmUp: true,
|
|
344
|
+
warmUp: true, // Pre-warm the model (default: true)
|
|
345
|
+
enableWasmCache: true, // Cache WASM assets (default: true)
|
|
327
346
|
});
|
|
328
347
|
await tts.initialize();
|
|
329
348
|
|
|
@@ -341,6 +360,51 @@ await tts.synthesizeAndAddToQueue("Hello world!");
|
|
|
341
360
|
await tts.dispose();
|
|
342
361
|
```
|
|
343
362
|
|
|
363
|
+
#### WASM Caching (New in 0.1.3)
|
|
364
|
+
|
|
365
|
+
The library automatically caches `piper_phonemize.data` (~9MB) and `piper_phonemize.wasm` in the browser Cache API. This eliminates repeated network downloads on every synthesis call.
|
|
366
|
+
|
|
367
|
+
**Zero-config (recommended):**
|
|
368
|
+
```typescript
|
|
369
|
+
const tts = new TTSLogic({ voiceId: "en_US-hfc_female-medium" });
|
|
370
|
+
await tts.initialize();
|
|
371
|
+
// WASM files cached automatically after first download
|
|
372
|
+
```
|
|
373
|
+
|
|
374
|
+
**Self-hosted WASM files:**
|
|
375
|
+
```typescript
|
|
376
|
+
const tts = new TTSLogic({
|
|
377
|
+
voiceId: "en_US-hfc_female-medium",
|
|
378
|
+
wasmPaths: {
|
|
379
|
+
piperData: "/piper-wasm/piper_phonemize.data",
|
|
380
|
+
piperWasm: "/piper-wasm/piper_phonemize.wasm",
|
|
381
|
+
onnxWasm: "/ort/ort-wasm-simd.wasm", // optional
|
|
382
|
+
},
|
|
383
|
+
});
|
|
384
|
+
```
|
|
385
|
+
|
|
386
|
+
**Disable caching:**
|
|
387
|
+
```typescript
|
|
388
|
+
const tts = new TTSLogic({
|
|
389
|
+
voiceId: "en_US-hfc_female-medium",
|
|
390
|
+
enableWasmCache: false, // Uses CDN URLs directly
|
|
391
|
+
});
|
|
392
|
+
```
|
|
393
|
+
|
|
394
|
+
**Utility functions:**
|
|
395
|
+
```typescript
|
|
396
|
+
import { ensureWasmCached, isWasmCached, clearWasmCache } from "speech-to-speech/tts";
|
|
397
|
+
|
|
398
|
+
// Prefetch WASM assets before initialization
|
|
399
|
+
await ensureWasmCached(); // Returns { piperData: blob:..., piperWasm: blob:... }
|
|
400
|
+
|
|
401
|
+
// Check if cached
|
|
402
|
+
const cached = await isWasmCached(); // true/false
|
|
403
|
+
|
|
404
|
+
// Clear cache
|
|
405
|
+
await clearWasmCache();
|
|
406
|
+
```
|
|
407
|
+
|
|
344
408
|
### Audio Playback
|
|
345
409
|
|
|
346
410
|
#### `sharedAudioPlayer` (Recommended)
|
|
@@ -569,9 +633,137 @@ function stop() {
|
|
|
569
633
|
}
|
|
570
634
|
```
|
|
571
635
|
|
|
636
|
+
## Unified Speech Service
|
|
637
|
+
|
|
638
|
+
`createSpeechService()` wires STT and TTS together so you need fewer imports and no manual callback plumbing.
|
|
639
|
+
|
|
640
|
+
```ts
|
|
641
|
+
import { createSpeechService } from "speech-to-speech";
|
|
642
|
+
|
|
643
|
+
const service = createSpeechService();
|
|
644
|
+
|
|
645
|
+
// 1. Set up STT
|
|
646
|
+
service.initializeSTT({
|
|
647
|
+
onTranscript: (text) => console.log("Final:", text),
|
|
648
|
+
onInterimTranscript: (text) => setLiveCaption(text), // real-time display
|
|
649
|
+
onWordsUpdate: (words) => console.log("Words so far:", words),
|
|
650
|
+
onStatusChange: (type, data) => {
|
|
651
|
+
if (type === "speaking") setUserSpeaking(data as boolean);
|
|
652
|
+
},
|
|
653
|
+
});
|
|
654
|
+
|
|
655
|
+
// 2. Set up TTS (awaitable)
|
|
656
|
+
await service.initializeTTS({ voiceId: "en_US-hfc_female-medium" });
|
|
657
|
+
|
|
658
|
+
// 3. Start session
|
|
659
|
+
service.startListening();
|
|
660
|
+
await service.speak("Hello, how can I help you?");
|
|
661
|
+
|
|
662
|
+
// 4. End session
|
|
663
|
+
const transcript = service.stopListening();
|
|
664
|
+
service.stopSpeaking();
|
|
665
|
+
```
|
|
666
|
+
|
|
667
|
+
---
|
|
668
|
+
|
|
669
|
+
## Interim Transcript Streaming
|
|
670
|
+
|
|
671
|
+
Get real-time partial results while the user is still speaking. Pass `onInterimTranscript` directly to `initializeSTT()`:
|
|
672
|
+
|
|
673
|
+
```ts
|
|
674
|
+
import { createSpeechService } from "speech-to-speech";
|
|
675
|
+
|
|
676
|
+
const service = createSpeechService();
|
|
677
|
+
|
|
678
|
+
service.initializeSTT({
|
|
679
|
+
onTranscript: (finalText) => console.log("Final:", finalText),
|
|
680
|
+
onInterimTranscript: (partialText) => {
|
|
681
|
+
// Called on every interim result — great for live captions
|
|
682
|
+
liveCaption.textContent = partialText;
|
|
683
|
+
},
|
|
684
|
+
});
|
|
685
|
+
|
|
686
|
+
await service.initializeTTS({ voiceId: "en_US-hfc_female-medium" });
|
|
687
|
+
service.startListening();
|
|
688
|
+
```
|
|
689
|
+
|
|
690
|
+
---
|
|
691
|
+
|
|
692
|
+
## TTS Warmup
|
|
693
|
+
|
|
694
|
+
Call `prefetchTTSModel()` early in your app boot (e.g. after page load) so the first `speak()` call has no cold-start delay:
|
|
695
|
+
|
|
696
|
+
```ts
|
|
697
|
+
import { prefetchTTSModel } from "speech-to-speech";
|
|
698
|
+
|
|
699
|
+
// Fire-and-forget — safe to call before the user interacts
|
|
700
|
+
prefetchTTSModel("en_US-hfc_female-medium");
|
|
701
|
+
|
|
702
|
+
// Later, when the user actually triggers speech:
|
|
703
|
+
const tts = new TTSLogic({ voiceId: "en_US-hfc_female-medium" });
|
|
704
|
+
await tts.initialize(); // instant — model already cached
|
|
705
|
+
```
|
|
706
|
+
|
|
707
|
+
---
|
|
708
|
+
|
|
709
|
+
## Browser Compatibility Check
|
|
710
|
+
|
|
711
|
+
Gate your UI before attempting to start STT or TTS:
|
|
712
|
+
|
|
713
|
+
```ts
|
|
714
|
+
import { getCompatibilityInfo } from "speech-to-speech";
|
|
715
|
+
|
|
716
|
+
const { stt, tts, browser } = getCompatibilityInfo();
|
|
717
|
+
|
|
718
|
+
if (!stt) {
|
|
719
|
+
showBanner(`Speech input is not supported in ${browser}. Please use Chrome or Edge.`);
|
|
720
|
+
}
|
|
721
|
+
if (!tts) {
|
|
722
|
+
showBanner("Text-to-speech is not supported in this browser.");
|
|
723
|
+
}
|
|
724
|
+
```
|
|
725
|
+
|
|
726
|
+
---
|
|
727
|
+
|
|
728
|
+
## Text Cleanup for TTS
|
|
729
|
+
|
|
730
|
+
Strip HTML, Markdown, and emoji from LLM responses before passing them to synthesis:
|
|
731
|
+
|
|
732
|
+
```ts
|
|
733
|
+
import { cleanTextForTTS } from "speech-to-speech";
|
|
734
|
+
|
|
735
|
+
const raw = "**Hello** <b>world</b>! Here's a [link](https://example.com) 🎉";
|
|
736
|
+
const spoken = cleanTextForTTS(raw);
|
|
737
|
+
// → "Hello world Here's a link"
|
|
738
|
+
|
|
739
|
+
// Or opt-out of individual steps:
|
|
740
|
+
const spoken2 = cleanTextForTTS(raw, { removeEmojis: false });
|
|
741
|
+
// → "Hello world Here's a link 🎉"
|
|
742
|
+
```
|
|
743
|
+
|
|
744
|
+
---
|
|
745
|
+
|
|
746
|
+
## Audio Player Status Callbacks
|
|
747
|
+
|
|
748
|
+
React to playback state changes without polling:
|
|
749
|
+
|
|
750
|
+
```ts
|
|
751
|
+
import { sharedAudioPlayer } from "speech-to-speech";
|
|
752
|
+
|
|
753
|
+
sharedAudioPlayer.setStatusCallback((status) => {
|
|
754
|
+
console.log("[TTS]", status); // e.g. "Playing audio chunk 1"
|
|
755
|
+
});
|
|
756
|
+
|
|
757
|
+
sharedAudioPlayer.setPlayingChangeCallback((isPlaying) => {
|
|
758
|
+
setTTSIndicator(isPlaying); // show/hide a speaking indicator in UI
|
|
759
|
+
});
|
|
760
|
+
```
|
|
761
|
+
|
|
762
|
+
---
|
|
763
|
+
|
|
572
764
|
## Available Piper Voices
|
|
573
765
|
|
|
574
|
-
Voice models are downloaded automatically from CDN on first use (~20-80MB per voice).
|
|
766
|
+
Voice models are downloaded automatically from CDN on first use (~20-80MB per voice). WASM files (~9MB) are cached automatically and reused across all voices.
|
|
575
767
|
|
|
576
768
|
| Voice ID | Language | Description |
|
|
577
769
|
| ------------------------- | ------------ | ------------------------------ |
|
|
@@ -602,7 +794,8 @@ See [Piper Voices](https://rhasspy.github.io/piper-samples/) for the complete li
|
|
|
602
794
|
| Issue | Solution |
|
|
603
795
|
| -------------------- | --------------------------------------------------------------------------------------- |
|
|
604
796
|
| "Voice not found" | Check voice ID spelling. Use `en_US-hfc_female-medium` for testing. |
|
|
605
|
-
| Slow first synthesis | Normal - voice model (~20MB)
|
|
797
|
+
| Slow first synthesis | Normal - voice model (~20MB) and WASM files (~9MB) download on first use. Subsequent calls use cached assets. |
|
|
798
|
+
| Repeated WASM downloads | Ensure `enableWasmCache: true` (default). Check browser Cache API support. |
|
|
606
799
|
| No audio output | Ensure browser supports Web Audio API. Check volume and audio permissions. |
|
|
607
800
|
| CORS errors | Ensure Vite config has proper COOP/COEP headers (see above). |
|
|
608
801
|
|
|
@@ -647,6 +840,25 @@ npm run clean # Remove dist/
|
|
|
647
840
|
- **[Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)** - Browser speech recognition
|
|
648
841
|
- **[Web Audio API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API)** - Audio processing
|
|
649
842
|
|
|
843
|
+
## Changelog
|
|
844
|
+
|
|
845
|
+
### v0.1.4
|
|
846
|
+
|
|
847
|
+
- **`createSpeechService()`** — Unified service wrapper that wires STT + TTS together with a single ergonomic API. Supports `initializeSTT`, `initializeTTS`, `startListening`, `stopListening`, `speak`, `stopSpeaking`, and `getCompatibilityInfo`.
|
|
848
|
+
- **`onInterimTranscript`** — New option in `STTLogic` (and `createSpeechService().initializeSTT()`) to receive real-time partial transcript updates while the user is still speaking.
|
|
849
|
+
- **`prefetchTTSModel(voiceId)`** — Pre-warm a Piper voice early in app boot to eliminate cold-start latency on the first `speak()` call.
|
|
850
|
+
- **`getCompatibilityInfo()`** — Returns `{ stt, tts, browser }` for browser feature detection and UI gating.
|
|
851
|
+
- **`cleanTextForTTS(text, options?)`** — Strips HTML, Markdown, and emoji from text before synthesis. Options: `stripHtml`, `stripMarkdown`, `removeEmojis` (all default `true`).
|
|
852
|
+
|
|
853
|
+
### v0.1.3
|
|
854
|
+
|
|
855
|
+
- Automatic WASM caching via the browser Cache API — `piper_phonemize.data` (~9MB) and `piper_phonemize.wasm` are fetched once and reused across sessions.
|
|
856
|
+
- `ensureWasmCached`, `isWasmCached`, `clearWasmCache` utility functions.
|
|
857
|
+
- `enableWasmCache` and `wasmPaths` options on `TTSLogic` for self-hosted WASM.
|
|
858
|
+
- Speech-aware audio player — queue automatically pauses while the user is speaking.
|
|
859
|
+
|
|
860
|
+
---
|
|
861
|
+
|
|
650
862
|
## License
|
|
651
863
|
|
|
652
864
|
MIT
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
import { F as FillerManager } from './filler-manager-hZwzWKVC.cjs';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* stt-tts-lib - Speech-to-Text and Text-to-Speech Library
|
|
5
|
+
* Copyright (C) 2026 Navgurukul
|
|
6
|
+
*
|
|
7
|
+
* This program is free software: you can redistribute it and/or modify
|
|
8
|
+
* it under the terms of the GNU Affero General Public License as published by
|
|
9
|
+
* the Free Software Foundation, either version 3 of the License, or
|
|
10
|
+
* (at your option) any later version.
|
|
11
|
+
*
|
|
12
|
+
* This program is distributed in the hope that it will be useful,
|
|
13
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
14
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
15
|
+
* GNU Affero General Public License for more details.
|
|
16
|
+
*
|
|
17
|
+
* You should have received a copy of the GNU Affero General Public License
|
|
18
|
+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
19
|
+
*/
|
|
20
|
+
type ResetReason = "silence" | "utterance-complete" | "manual";
|
|
21
|
+
interface ResetStats {
|
|
22
|
+
utteranceStartedAt: number;
|
|
23
|
+
lastActivityAt: number;
|
|
24
|
+
partialTranscript: string;
|
|
25
|
+
}
|
|
26
|
+
interface ResetSTTOptions$1 {
|
|
27
|
+
/** Maximum silence (ms) allowed before forcing a reset. */
|
|
28
|
+
maxSilenceMs?: number;
|
|
29
|
+
/** Maximum utterance length (ms) before rotating to a fresh buffer. */
|
|
30
|
+
maxUtteranceMs?: number;
|
|
31
|
+
/** Optional reset hook for logging/analytics. */
|
|
32
|
+
onReset?: (reason: ResetReason, stats: ResetStats) => void;
|
|
33
|
+
/**
|
|
34
|
+
* Supply a clock for deterministic tests; defaults to Date.now.
|
|
35
|
+
* Using a function keeps the class platform-neutral.
|
|
36
|
+
*/
|
|
37
|
+
now?: () => number;
|
|
38
|
+
}
|
|
39
|
+
/**
|
|
40
|
+
* Tracks speech activity and decides when to reset an STT pipeline so tokens and streams do not grow unbounded.
|
|
41
|
+
*/
|
|
42
|
+
declare class ResetSTTLogic$1 {
|
|
43
|
+
private readonly maxSilenceMs;
|
|
44
|
+
private readonly maxUtteranceMs;
|
|
45
|
+
private readonly onReset?;
|
|
46
|
+
private readonly now;
|
|
47
|
+
private utteranceStartedAt;
|
|
48
|
+
private lastActivityAt;
|
|
49
|
+
private partialTranscript;
|
|
50
|
+
constructor(options?: ResetSTTOptions$1);
|
|
51
|
+
recordSpeechActivity(timestamp?: number): void;
|
|
52
|
+
updatePartialTranscript(partial: string, timestamp?: number): void;
|
|
53
|
+
shouldReset(timestamp?: number): ResetReason | null;
|
|
54
|
+
maybeReset(timestamp?: number): ResetReason | null;
|
|
55
|
+
forceReset(reason?: ResetReason, timestamp?: number): void;
|
|
56
|
+
private reset;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* stt-tts-lib - Speech-to-Text and Text-to-Speech Library
|
|
61
|
+
* Copyright (C) 2026 Navgurukul
|
|
62
|
+
*
|
|
63
|
+
* This program is free software: you can redistribute it and/or modify
|
|
64
|
+
* it under the terms of the GNU Affero General Public License as published by
|
|
65
|
+
* the Free Software Foundation, either version 3 of the License, or
|
|
66
|
+
* (at your option) any later version.
|
|
67
|
+
*
|
|
68
|
+
* This program is distributed in the hope that it will be useful,
|
|
69
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
70
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
71
|
+
* GNU Affero General Public License for more details.
|
|
72
|
+
*
|
|
73
|
+
* You should have received a copy of the GNU Affero General Public License
|
|
74
|
+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
75
|
+
*/
|
|
76
|
+
type VADControllerOptions = {
|
|
77
|
+
bufferSize?: number;
|
|
78
|
+
minSpeechMs?: number;
|
|
79
|
+
minSilenceMs?: number;
|
|
80
|
+
energyThreshold?: number;
|
|
81
|
+
dynamicThresholdFactor?: number;
|
|
82
|
+
noiseFloorSmoothing?: number;
|
|
83
|
+
noiseFloorDecay?: number;
|
|
84
|
+
maxAmplitude?: number;
|
|
85
|
+
};
|
|
86
|
+
declare class VADController {
|
|
87
|
+
private vad;
|
|
88
|
+
private voiceStartListeners;
|
|
89
|
+
private voiceStopListeners;
|
|
90
|
+
private running;
|
|
91
|
+
private options?;
|
|
92
|
+
constructor(options?: VADControllerOptions);
|
|
93
|
+
start(): Promise<void>;
|
|
94
|
+
stop(): void;
|
|
95
|
+
destroy(): void;
|
|
96
|
+
isActive(): boolean;
|
|
97
|
+
onVoiceStart(listener: () => void): () => void;
|
|
98
|
+
onVoiceStop(listener: () => void): () => void;
|
|
99
|
+
private emitVoiceStart;
|
|
100
|
+
private emitVoiceStop;
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
/**
|
|
104
|
+
* stt-tts-lib - Speech-to-Text and Text-to-Speech Library
|
|
105
|
+
* Copyright (C) 2026 Navgurukul
|
|
106
|
+
*
|
|
107
|
+
* This program is free software: you can redistribute it and/or modify
|
|
108
|
+
* it under the terms of the GNU Affero General Public License as published by
|
|
109
|
+
* the Free Software Foundation, either version 3 of the License, or
|
|
110
|
+
* (at your option) any later version.
|
|
111
|
+
*
|
|
112
|
+
* This program is distributed in the hope that it will be useful,
|
|
113
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
114
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
115
|
+
* GNU Affero General Public License for more details.
|
|
116
|
+
*
|
|
117
|
+
* You should have received a copy of the GNU Affero General Public License
|
|
118
|
+
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
119
|
+
*/
|
|
120
|
+
|
|
121
|
+
type WordUpdateCallback = (words: string[]) => void;
|
|
122
|
+
type MicTimeUpdateCallback = (ms: number) => void;
|
|
123
|
+
type RestartMetricsCallback = (count: number, lastDuration: number | null) => void;
|
|
124
|
+
type VadCallbacks = {
|
|
125
|
+
onSpeechStart?: () => void;
|
|
126
|
+
onSpeechEnd?: () => void;
|
|
127
|
+
};
|
|
128
|
+
type LogCallback = (message: string, type?: "info" | "error" | "warning") => void;
|
|
129
|
+
type TranscriptCallback = (transcript: string) => void;
|
|
130
|
+
interface ResetSTTOptions {
|
|
131
|
+
sessionDurationMs?: number;
|
|
132
|
+
interimSaveIntervalMs?: number;
|
|
133
|
+
preserveTranscriptOnStart?: boolean;
|
|
134
|
+
/** Enable short filler (default: false) */
|
|
135
|
+
enableShortFiller?: boolean;
|
|
136
|
+
/** Enable long filler (default: false) */
|
|
137
|
+
enableLongFiller?: boolean;
|
|
138
|
+
/** Delay before short filler in ms (default: 5000) */
|
|
139
|
+
shortFillerDelayMs?: number;
|
|
140
|
+
/** Delay before long filler in ms (default: 10000) */
|
|
141
|
+
longFillerDelayMs?: number;
|
|
142
|
+
/** Fallback short filler if LLM fails */
|
|
143
|
+
shortFillerFallback?: string;
|
|
144
|
+
/** Fallback long filler if LLM fails */
|
|
145
|
+
longFillerFallback?: string;
|
|
146
|
+
/** Callback when filler is generated */
|
|
147
|
+
onFillerGenerated?: (type: "short" | "long", text: string) => void;
|
|
148
|
+
/** LLM API URL (required for dynamic filler generation) */
|
|
149
|
+
llmApiUrl?: string;
|
|
150
|
+
/** LLM API Key */
|
|
151
|
+
llmApiKey?: string;
|
|
152
|
+
/** LLM Model name (default: "deepseek-chat") */
|
|
153
|
+
llmModel?: string;
|
|
154
|
+
/** LLM request timeout in ms (default: 3000) */
|
|
155
|
+
llmTimeoutMs?: number;
|
|
156
|
+
/** Language hint for LLM (e.g., "English", "Hindi") */
|
|
157
|
+
languageHint?: string;
|
|
158
|
+
/**
|
|
159
|
+
* Called on every interim (non-final) recognition result with the current
|
|
160
|
+
* partial transcript text. Useful for real-time UI updates.
|
|
161
|
+
* Does not affect the final transcript or setWordsUpdateCallback.
|
|
162
|
+
*/
|
|
163
|
+
onInterimTranscript?: (text: string) => void;
|
|
164
|
+
}
|
|
165
|
+
type STTLogicOptions = ResetSTTOptions;
|
|
166
|
+
declare class ResetSTTLogic {
|
|
167
|
+
private recognition;
|
|
168
|
+
private isListening;
|
|
169
|
+
private fullTranscript;
|
|
170
|
+
private heardWords;
|
|
171
|
+
private onLog;
|
|
172
|
+
private onTranscript;
|
|
173
|
+
private onWordsUpdate;
|
|
174
|
+
private onMicTimeUpdate;
|
|
175
|
+
private onRestartMetrics;
|
|
176
|
+
private options;
|
|
177
|
+
private micOnTime;
|
|
178
|
+
private sessionDuration;
|
|
179
|
+
private lastTickTime;
|
|
180
|
+
private micTimeInterval;
|
|
181
|
+
private restartCount;
|
|
182
|
+
private isRestarting;
|
|
183
|
+
private isRecognitionRunning;
|
|
184
|
+
private lastInterimTranscript;
|
|
185
|
+
private onInterimTranscriptCallback?;
|
|
186
|
+
private lastInterimSaveTime;
|
|
187
|
+
private interimSaveInterval;
|
|
188
|
+
private lastInterimResultTime;
|
|
189
|
+
private lastSavedLength;
|
|
190
|
+
private transcriptBeforeRestart;
|
|
191
|
+
private sessionStartTranscript;
|
|
192
|
+
private resultHandler?;
|
|
193
|
+
private errorHandler?;
|
|
194
|
+
private endHandler?;
|
|
195
|
+
private startHandler?;
|
|
196
|
+
private sessionId;
|
|
197
|
+
private awaitingRestartFirstResultId;
|
|
198
|
+
private lastWasFinal;
|
|
199
|
+
private restartMetrics;
|
|
200
|
+
private isAutoRestarting;
|
|
201
|
+
private onUserSpeechStart?;
|
|
202
|
+
private onUserSpeechEnd?;
|
|
203
|
+
private fillerManager;
|
|
204
|
+
constructor(onLog: LogCallback, onTranscript: TranscriptCallback, options?: ResetSTTOptions);
|
|
205
|
+
setWordsUpdateCallback(callback: WordUpdateCallback): void;
|
|
206
|
+
setMicTimeUpdateCallback(callback: MicTimeUpdateCallback): void;
|
|
207
|
+
setRestartMetricsCallback(callback: RestartMetricsCallback): void;
|
|
208
|
+
setVadCallbacks(onSpeechStart?: () => void, onSpeechEnd?: () => void): void;
|
|
209
|
+
getSessionDurationMs(): number;
|
|
210
|
+
isInAutoRestart(): boolean;
|
|
211
|
+
getFullTranscript(): string;
|
|
212
|
+
clearTranscript(): void;
|
|
213
|
+
private setupRecognition;
|
|
214
|
+
private waitForEventOnce;
|
|
215
|
+
private startMicTimer;
|
|
216
|
+
private stopMicTimer;
|
|
217
|
+
private saveInterimToFinal;
|
|
218
|
+
private getSuffixToAppend;
|
|
219
|
+
private collapseRepeats;
|
|
220
|
+
private performRestart;
|
|
221
|
+
start(): void;
|
|
222
|
+
stop(): void;
|
|
223
|
+
destroy(): void;
|
|
224
|
+
/**
|
|
225
|
+
* Get the filler manager instance (if enabled)
|
|
226
|
+
*/
|
|
227
|
+
getFillerManager(): FillerManager | null;
|
|
228
|
+
/**
|
|
229
|
+
* Set a custom synthesizer for filler audio generation.
|
|
230
|
+
* Optional - internal TTS is used by default.
|
|
231
|
+
*/
|
|
232
|
+
setFillerSynthesizer(synthesize: (text: string) => Promise<{
|
|
233
|
+
audio: Float32Array;
|
|
234
|
+
sampleRate: number;
|
|
235
|
+
}>): void;
|
|
236
|
+
/**
|
|
237
|
+
* Get the generated short filler text (null if not generated yet)
|
|
238
|
+
*/
|
|
239
|
+
getShortFiller(): string | null;
|
|
240
|
+
/**
|
|
241
|
+
* Get the generated long filler text (null if not generated yet)
|
|
242
|
+
*/
|
|
243
|
+
getLongFiller(): string | null;
|
|
244
|
+
}
|
|
245
|
+
interface CompatibilityInfo {
|
|
246
|
+
/** Whether the Web Speech Recognition API is available in this browser */
|
|
247
|
+
stt: boolean;
|
|
248
|
+
/** Whether Piper TTS is supported (Web Audio API present) */
|
|
249
|
+
tts: boolean;
|
|
250
|
+
/** Detected browser label */
|
|
251
|
+
browser: "Chrome" | "Edge" | "Firefox" | "Safari" | "unknown";
|
|
252
|
+
}
|
|
253
|
+
/**
|
|
254
|
+
* Returns a snapshot of browser feature support relevant to this library.
|
|
255
|
+
* Useful for gating UI elements or showing user-friendly warnings before
|
|
256
|
+
* attempting to start STT or TTS.
|
|
257
|
+
*
|
|
258
|
+
* @example
|
|
259
|
+
* const { stt, browser } = getCompatibilityInfo();
|
|
260
|
+
* if (!stt) alert(`Speech input is not supported in ${browser}.`);
|
|
261
|
+
*/
|
|
262
|
+
declare function getCompatibilityInfo(): CompatibilityInfo;
|
|
263
|
+
declare class STTLogic extends ResetSTTLogic {
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
export { type CompatibilityInfo as C, type MicTimeUpdateCallback as M, type ResetSTTOptions as R, STTLogic as S, VADController as V, type WordUpdateCallback as W, ResetSTTLogic$1 as a, type ResetReason as b, type ResetSTTOptions$1 as c, type ResetStats as d, type VADControllerOptions as e, type STTLogicOptions as f, type RestartMetricsCallback as g, type VadCallbacks as h, getCompatibilityInfo as i };
|