speech-to-speech 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,591 @@
1
+ # stt-tts-lib
2
+
3
+ TypeScript utilities for speech-to-text (STT) and text-to-speech (TTS) in the browser. Ships ESM/CJS bundles with full TypeScript declarations.
4
+
5
+ **Features:**
6
+
7
+ - 🎤 **STT**: Browser-native speech recognition with session management
8
+ - 🔊 **TTS**: Piper neural TTS with automatic model downloading
9
+ - ✅ **Zero Config**: No manual ONNX setup required - everything is handled automatically
10
+ - 📦 **Small**: ~135KB package size
11
+
12
+ ## Quick Start
13
+
14
+ ### Installation
15
+
16
+ ```bash
17
+ npm install stt-tts-lib
18
+ ```
19
+
20
+ ### Basic Usage
21
+
22
+ ```typescript
23
+ import { STTLogic, TTSLogic, createAudioPlayer } from "stt-tts-lib";
24
+
25
+ // Speech-to-Text
26
+ const stt = new STTLogic(
27
+ (msg, level) => console.log(`[${level}] ${msg}`),
28
+ (transcript) => console.log("Transcript:", transcript)
29
+ );
30
+ stt.start();
31
+
32
+ // Text-to-Speech
33
+ const synthesizer = new TTSLogic({
34
+ voiceId: "en_US-hfc_female-medium",
35
+ });
36
+ await synthesizer.initialize();
37
+ const player = createAudioPlayer({ sampleRate: 22050 });
38
+
39
+ const result = await synthesizer.synthesize("Hello world!");
40
+ await player.play(result.audio, result.sampleRate);
41
+ ```
42
+
43
+ ## Vite Configuration (Required)
44
+
45
+ For Vite-based projects, add this configuration to `vite.config.js`:
46
+
47
+ ```javascript
48
+ import { defineConfig } from "vite";
49
+
50
+ export default defineConfig({
51
+ server: {
52
+ headers: {
53
+ // Required for SharedArrayBuffer (WASM multi-threading)
54
+ "Cross-Origin-Opener-Policy": "same-origin",
55
+ "Cross-Origin-Embedder-Policy": "require-corp",
56
+ },
57
+ },
58
+ optimizeDeps: {
59
+ // Force pre-bundling for dev server compatibility
60
+ include: ["onnxruntime-web", "@realtimex/piper-tts-web"],
61
+ esbuildOptions: {
62
+ target: "esnext",
63
+ },
64
+ },
65
+ });
66
+ ```
67
+
68
+ ## Next.js Configuration (Required)
69
+
70
+ For Next.js projects, you need additional configuration since this library uses browser-only APIs (Web Speech, Web Audio, ONNX WASM).
71
+
72
+ ### 1. Configure Headers in `next.config.js`
73
+
74
+ ```javascript
75
+ /** @type {import('next').NextConfig} */
76
+ const nextConfig = {
77
+ // Required for SharedArrayBuffer (WASM multi-threading)
78
+ async headers() {
79
+ return [
80
+ {
81
+ source: "/(.*)",
82
+ headers: [
83
+ {
84
+ key: "Cross-Origin-Opener-Policy",
85
+ value: "same-origin",
86
+ },
87
+ {
88
+ key: "Cross-Origin-Embedder-Policy",
89
+ value: "require-corp",
90
+ },
91
+ ],
92
+ },
93
+ ];
94
+ },
95
+ // Stub out Node.js built-ins (fs, path) that ONNX dependencies reference in the browser bundle
96
+ webpack: (config, { isServer }) => {
97
+ if (!isServer) {
98
+ config.resolve.fallback = {
99
+ ...config.resolve.fallback,
100
+ fs: false,
101
+ path: false,
102
+ };
103
+ }
104
+ return config;
105
+ },
106
+ };
107
+
108
+ module.exports = nextConfig;
109
+ ```
110
+
111
+ ### 2. Client-Side Only Usage
112
+
113
+ Since this library uses browser APIs, you **must** ensure it only runs on the client:
114
+
115
+ **Option A: Dynamic Import (Recommended)**
116
+
117
+ ```typescript
118
+ "use client";
119
+
120
+ import { useEffect, useState, useRef } from "react";
121
+ import type { TTSLogic, AudioPlayer } from "stt-tts-lib";
122
+
123
+ export default function SpeechComponent() {
124
+ const [isReady, setIsReady] = useState(false);
125
+ const ttsRef = useRef<TTSLogic | null>(null);
126
+ const playerRef = useRef<AudioPlayer | null>(null);
127
+
128
+ useEffect(() => {
129
+ // Dynamic import to avoid SSR
130
+ async function initTTS() {
131
+ const { TTSLogic, createAudioPlayer } = await import("stt-tts-lib");
132
+
133
+ ttsRef.current = new TTSLogic({
134
+ voiceId: "en_US-hfc_female-medium",
135
+ });
136
+ await ttsRef.current.initialize();
137
+
138
+ playerRef.current = createAudioPlayer({ sampleRate: 22050 });
139
+ setIsReady(true);
140
+ }
141
+
142
+ initTTS();
143
+
144
+ return () => {
145
+ ttsRef.current?.dispose();
146
+ playerRef.current?.close();
147
+ };
148
+ }, []);
149
+
150
+ const speak = async (text: string) => {
151
+ if (!ttsRef.current || !playerRef.current) return;
152
+ const result = await ttsRef.current.synthesize(text);
153
+ await playerRef.current.play(result.audio, result.sampleRate);
154
+ };
155
+
156
+ return (
157
+ <button onClick={() => speak("Hello from Next.js!")} disabled={!isReady}>
158
+ {isReady ? "Speak" : "Loading..."}
159
+ </button>
160
+ );
161
+ }
162
+ ```
163
+
164
+ **Option B: Using `next/dynamic` with `ssr: false`**
165
+
166
+ ```typescript
167
+ // components/SpeechWrapper.tsx
168
+ "use client";
169
+
170
+ import dynamic from "next/dynamic";
171
+
172
+ const SpeechComponent = dynamic(() => import("./SpeechComponent"), {
173
+ ssr: false,
174
+ loading: () => <p>Loading speech features...</p>,
175
+ });
176
+
177
+ export default SpeechComponent;
178
+ ```
179
+
180
+ ### 3. Complete Next.js Example with STT + TTS
181
+
182
+ ```typescript
183
+ "use client";
184
+
185
+ import { useEffect, useState, useRef, useCallback } from "react";
186
+ import type { STTLogic, TTSLogic, AudioPlayer } from "stt-tts-lib";
187
+
188
+ export default function VoiceChat() {
189
+ const [transcript, setTranscript] = useState("");
190
+ const [isListening, setIsListening] = useState(false);
191
+ const [isReady, setIsReady] = useState(false);
192
+
193
+ const sttRef = useRef<STTLogic | null>(null);
194
+ const ttsRef = useRef<TTSLogic | null>(null);
195
+ const playerRef = useRef<AudioPlayer | null>(null);
196
+
197
+ useEffect(() => {
198
+ async function init() {
199
+ const { STTLogic, TTSLogic, createAudioPlayer } = await import(
200
+ "stt-tts-lib"
201
+ );
202
+
203
+ // Initialize TTS
204
+ ttsRef.current = new TTSLogic({
205
+ voiceId: "en_US-hfc_female-medium",
206
+ });
207
+ await ttsRef.current.initialize();
208
+ playerRef.current = createAudioPlayer({ sampleRate: 22050 });
209
+
210
+ // Initialize STT
211
+ sttRef.current = new STTLogic(
212
+ (msg, level) => console.log(`[STT ${level}]`, msg),
213
+ (text) => setTranscript(text)
214
+ );
215
+
216
+ setIsReady(true);
217
+ }
218
+
219
+ init();
220
+
221
+ return () => {
222
+ sttRef.current?.destroy();
223
+ ttsRef.current?.dispose();
224
+ playerRef.current?.close();
225
+ };
226
+ }, []);
227
+
228
+ const toggleListening = useCallback(() => {
229
+ if (!sttRef.current) return;
230
+ if (isListening) {
231
+ sttRef.current.stop();
232
+ } else {
233
+ sttRef.current.start();
234
+ }
235
+ setIsListening(!isListening);
236
+ }, [isListening]);
237
+
238
+ const speak = async () => {
239
+ if (!ttsRef.current || !playerRef.current || !transcript) return;
240
+ const result = await ttsRef.current.synthesize(transcript);
241
+ await playerRef.current.play(result.audio, result.sampleRate);
242
+ };
243
+
244
+ if (!isReady) return <p>Loading speech features...</p>;
245
+
246
+ return (
247
+ <div>
248
+ <button onClick={toggleListening}>
249
+ {isListening ? "Stop Listening" : "Start Listening"}
250
+ </button>
251
+ <p>Transcript: {transcript}</p>
252
+ <button onClick={speak} disabled={!transcript}>
253
+ Read Aloud
254
+ </button>
255
+ </div>
256
+ );
257
+ }
258
+ ```
259
+
260
+ ## Exports
261
+
262
+ ```typescript
263
+ // Main bundle (STT + TTS)
264
+ import { STTLogic, TTSLogic, createAudioPlayer } from "stt-tts-lib";
265
+
266
+ // STT only
267
+ import { STTLogic, ResetSTTLogic, VADController } from "stt-tts-lib/stt";
268
+
269
+ // TTS only
270
+ import { TTSLogic, createAudioPlayer } from "stt-tts-lib/tts";
271
+ ```
272
+
273
+ ## API Reference
274
+
275
+ ### STT (Speech-to-Text)
276
+
277
+ #### `STTLogic`
278
+
279
+ Main speech recognition controller with session management.
280
+
281
+ ```typescript
282
+ const stt = new STTLogic(
283
+ // Log callback
284
+ (message: string, level?: string) => void,
285
+ // Transcript callback
286
+ (transcript: string) => void,
287
+ // Options
288
+ {
289
+ sessionDurationMs?: number, // Session duration (default: 30000)
290
+ interimSaveIntervalMs?: number, // Interim save interval (default: 5000)
291
+ preserveTranscriptOnStart?: boolean,
292
+ }
293
+ );
294
+
295
+ // Methods
296
+ stt.start(); // Start listening
297
+ stt.stop(); // Stop listening
298
+ stt.destroy(); // Cleanup resources
299
+ stt.getFullTranscript(); // Get accumulated transcript
300
+ stt.clearTranscript(); // Clear transcript
301
+ stt.setWordsUpdateCallback((words) => {}); // Listen for word updates
302
+ ```
303
+
304
+ #### `ResetSTTLogic`
305
+
306
+ Low-level reset logic for custom STT implementations.
307
+
308
+ ```typescript
309
+ const reset = new ResetSTTLogic({
310
+ maxSilenceMs: 1500,
311
+ maxUtteranceMs: 8000,
312
+ onReset: (reason, stats) => console.log("reset", reason, stats),
313
+ });
314
+ ```
315
+
316
+ #### `VADController`
317
+
318
+ Voice Activity Detection controller.
319
+
320
+ ```typescript
321
+ const vad = new VADController({
322
+ activation: -35,
323
+ release: -45,
324
+ hangoverFrames: 10,
325
+ });
326
+ ```
327
+
328
+ ### TTS (Text-to-Speech)
329
+
330
+ #### `TTSLogic`
331
+
332
+ Piper TTS synthesizer class. Voice models are downloaded automatically on first use.
333
+
334
+ ```typescript
335
+ const synthesizer = new TTSLogic({
336
+ voiceId: "en_US-hfc_female-medium", // Piper voice ID
337
+ });
338
+ await synthesizer.initialize();
339
+
340
+ // Synthesize text to audio
341
+ const result = await synthesizer.synthesize("Hello world!");
342
+ // result.audio: Float32Array
343
+ // result.audioBlob: Blob (WAV format)
344
+ // result.sampleRate: number
345
+ // result.duration: number (seconds)
346
+
347
+ // Get WAV blob only (faster, no decoding)
348
+ const blob = await synthesizer.synthesizeToBlob("Hello world!");
349
+
350
+ // Cleanup
351
+ await synthesizer.dispose();
352
+ ```
353
+
354
+ #### `createAudioPlayer(config)`
355
+
356
+ Creates an audio player for playback.
357
+
358
+ ```typescript
359
+ const player = createAudioPlayer({
360
+ sampleRate: 22050,
361
+ });
362
+
363
+ // Play audio
364
+ await player.play(audioData, sampleRate);
365
+
366
+ // Stop playback
367
+ player.stop();
368
+
369
+ // Cleanup
370
+ await player.close();
371
+ ```
372
+
373
+ ### Available Piper Voices
374
+
375
+ Voice models are downloaded automatically from CDN on first use (~10-80MB per voice, depending on quality).
376
+
377
+ | Voice ID | Language | Description |
378
+ | ------------------------- | ------------ | ------------------------------ |
379
+ | `en_US-hfc_female-medium` | English (US) | Female, medium quality |
380
+ | `en_US-lessac-medium` | English (US) | Neutral, medium quality |
381
+ | `en_US-lessac-low` | English (US) | Neutral, low quality (smaller) |
382
+ | `en_US-lessac-high` | English (US) | Neutral, high quality (larger) |
383
+ | `en_GB-alba-medium` | English (UK) | British accent |
384
+ | `de_DE-thorsten-medium` | German | German voice |
385
+ | `fr_FR-upmc-medium` | French | French voice |
386
+
387
+ See [Piper Voices](https://rhasspy.github.io/piper-samples/) for the complete list.
388
+
389
+ ## Usage Examples
390
+
391
+ ### Complete STT Example
392
+
393
+ ```typescript
394
+ import { STTLogic } from "stt-tts-lib";
395
+
396
+ // Create STT instance
397
+ const stt = new STTLogic(
398
+ (message, level) => console.log(`[STT ${level}] ${message}`),
399
+ (transcript) => {
400
+ document.getElementById("output")!.textContent = transcript;
401
+ },
402
+ {
403
+ sessionDurationMs: 30000,
404
+ interimSaveIntervalMs: 5000,
405
+ }
406
+ );
407
+
408
+ // Listen for individual words
409
+ stt.setWordsUpdateCallback((words) => {
410
+ console.log("Heard words:", words);
411
+ });
412
+
413
+ // Start listening
414
+ stt.start();
415
+
416
+ // Stop after 10 seconds
417
+ setTimeout(() => {
418
+ stt.stop();
419
+ console.log("Final transcript:", stt.getFullTranscript());
420
+ }, 10000);
421
+
422
+ // Cleanup on page unload
423
+ window.addEventListener("beforeunload", () => stt.destroy());
424
+ ```
425
+
426
+ ### Complete TTS Example
427
+
428
+ ```typescript
429
+ import { TTSLogic, createAudioPlayer } from "stt-tts-lib";
430
+
431
+ async function speak(text: string) {
432
+ // Initialize (downloads voice model on first use)
433
+ const synthesizer = new TTSLogic({
434
+ voiceId: "en_US-hfc_female-medium",
435
+ });
436
+ await synthesizer.initialize();
437
+
438
+ const player = createAudioPlayer({ sampleRate: 22050 });
439
+
440
+ // Synthesize and play
441
+ const result = await synthesizer.synthesize(text);
442
+ console.log(`Generated ${result.duration.toFixed(2)}s of audio`);
443
+
444
+ await player.play(result.audio, result.sampleRate);
445
+
446
+ // Cleanup
447
+ await synthesizer.dispose();
448
+ await player.close();
449
+ }
450
+
451
+ // Usage
452
+ speak("Hello! This is Piper text-to-speech running in your browser.");
453
+ ```
454
+
455
+ ### Combined STT + TTS Example
456
+
457
+ ```typescript
458
+ import {
459
+ STTLogic,
460
+ TTSLogic,
461
+ createAudioPlayer,
462
+ type AudioPlayer,
463
+ } from "stt-tts-lib";
464
+
465
+ let stt: STTLogic | null = null;
466
+ let tts: TTSLogic | null = null;
467
+ let player: AudioPlayer | null = null;
468
+
469
+ async function init() {
470
+ // Initialize TTS
471
+ tts = new TTSLogic({
472
+ voiceId: "en_US-hfc_female-medium",
473
+ });
474
+ await tts.initialize();
475
+ player = createAudioPlayer({ sampleRate: 22050 });
476
+
477
+ // Initialize STT with echo response
478
+ stt = new STTLogic(
479
+ (msg) => console.log(msg),
480
+ async (transcript) => {
481
+ console.log("You said:", transcript);
482
+
483
+ // Echo back what was heard
484
+ if (tts && player && transcript.trim()) {
485
+ const result = await tts.synthesize(`You said: ${transcript}`);
486
+ await player.play(result.audio, result.sampleRate);
487
+ }
488
+ }
489
+ );
490
+ }
491
+
492
+ // Start listening
493
+ function startListening() {
494
+ stt?.start();
495
+ }
496
+
497
+ // Stop listening
498
+ function stopListening() {
499
+ stt?.stop();
500
+ }
501
+
502
+ // Cleanup
503
+ function cleanup() {
504
+ stt?.destroy();
505
+ tts?.dispose();
506
+ player?.close();
507
+ }
508
+ ```
509
+
510
+ ## Build & Scripts
511
+
512
+ ```bash
513
+ npm run build # Bundle with tsup (ESM/CJS + d.ts) into dist/
514
+ npm run lint # Type-check with tsc --noEmit
515
+ npm run clean # Remove dist/
516
+ ```
517
+
518
+ ## Browser Compatibility
519
+
520
+ | Feature | Chrome | Firefox | Safari | Edge |
521
+ | ------------------------ | ------ | ------- | ------ | ---- |
522
+ | STT (Speech Recognition) | ✅ | ❌ | ✅ | ✅ |
523
+ | TTS (Piper ONNX) | ✅ | ✅ | ✅ | ✅ |
524
+ | Web Audio API | ✅ | ✅ | ✅ | ✅ |
525
+
526
+ **Note:** Speech Recognition API requires Chrome, Safari, or Edge. Firefox does not support the Web Speech API.
527
+
528
+ ## Troubleshooting
529
+
530
+ ### TTS Issues
531
+
532
+ | Issue | Solution |
533
+ | -------------------- | --------------------------------------------------------------------------------------- |
534
+ | "Voice not found" | Check voice ID spelling. Use `en_US-hfc_female-medium` for testing. |
535
+ | Slow first synthesis | Normal - voice model (~20MB) downloads on first use. Subsequent calls use cached model. |
536
+ | No audio output | Ensure browser supports Web Audio API. Check volume and audio permissions. |
537
+ | CORS errors | Ensure Vite config has proper COOP/COEP headers (see above). |
538
+
539
+ ### STT Issues
540
+
541
+ | Issue | Solution |
542
+ | ---------------------------------- | ------------------------------------------------------------------------------------------ |
543
+ | "Speech Recognition not supported" | Use Chrome, Safari, or Edge. Firefox doesn't support Web Speech API. |
544
+ | No transcript | Check microphone permissions. Ensure `stt.start()` was called. |
545
+ | Transcript stops | Browser sessions time out after ~30s. Library auto-restarts, but check `sessionDurationMs`. |
546
+
547
+ ### Dev Server Issues (Vite)
548
+
549
+ | Issue | Solution |
550
+ | ----------------------------------------------- | ------------------------------------------------------ |
551
+ | "Module externalized for browser compatibility" | Add `optimizeDeps.include` in Vite config (see above). |
552
+ | WASM loading errors | Ensure COOP/COEP headers are set in Vite config. |
553
+ | Works in production but not dev | Clear `.vite` cache: `rm -rf node_modules/.vite` |
554
+
555
+ ### Next.js Issues
556
+
557
+ | Issue | Solution |
558
+ | ------------------------- | --------------------------------------------------------------------------- |
559
+ | "window is not defined" | Use dynamic import inside `useEffect` or `next/dynamic` with `ssr: false`. |
560
+ | "document is not defined" | Same as above - library must only run on client side. |
561
+ | SharedArrayBuffer errors | Ensure COOP/COEP headers are set in `next.config.js` (see Next.js section). |
562
+ | WASM file not loading | Check browser console for CORS errors. Verify headers config is applied. |
563
+ | Hydration mismatch | Wrap speech components with `dynamic(() => import(...), { ssr: false })`. |
564
+
565
+ ## Built With
566
+
567
+ This library leverages the following open-source tools and technologies:
568
+
569
+ ### Core Technologies
570
+
571
+ - **[ONNX Runtime Web](https://github.com/microsoft/onnxruntime)** - Microsoft's cross-platform ML inference engine for running neural networks in the browser via WebAssembly
572
+ - **[Piper TTS](https://github.com/rhasspy/piper)** - Fast, local neural text-to-speech system by Rhasspy using ONNX models
573
+ - **[@realtimex/piper-tts-web](https://github.com/synesthesiam/piper)** - Browser-compatible wrapper for Piper TTS models
574
+
575
+ ### Browser APIs
576
+
577
+ - **[Web Speech API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Speech_API)** - Native browser speech recognition (Chrome, Safari, Edge)
578
+ - **[Web Audio API](https://developer.mozilla.org/en-US/docs/Web/API/Web_Audio_API)** - High-level JavaScript API for audio processing and synthesis
579
+
580
+ ### AI Models
581
+
582
+ - **[Piper Neural Voices](https://rhasspy.github.io/piper-samples/)** - High-quality multilingual TTS models trained using VITS (Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech)
583
+ - Models support 40+ languages
584
+ - Quality ranges from low (~10MB) to high (~80MB) per voice
585
+ - Models automatically downloaded from CDN on first use
586
+
587
+
588
+
589
+ ## License
590
+
591
+ MIT