@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -14
- package/dist/auto-update-DsWBBnEk.mjs +3 -0
- package/dist/browser/index.d.mts +401 -5
- package/dist/browser/index.d.mts.map +1 -1
- package/dist/browser/index.mjs +1772 -146
- package/dist/browser/index.mjs.map +1 -1
- package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
- package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
- package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
- package/dist/cli.mjs +3359 -646
- package/dist/cli.mjs.map +1 -1
- package/dist/frameworks/express.d.mts +1 -1
- package/dist/frameworks/express.mjs +3 -3
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.mjs +3 -3
- package/dist/frameworks/next.d.mts +2 -2
- package/dist/frameworks/next.mjs +3 -3
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.mjs +3 -3
- package/dist/gerbil-DeQlX_Mt.mjs +5 -0
- package/dist/gerbil-POAz8peb.d.mts +431 -0
- package/dist/gerbil-POAz8peb.d.mts.map +1 -0
- package/dist/gerbil-yoSpRHgv.mjs +1463 -0
- package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
- package/dist/index.d.mts +395 -9
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +8 -6
- package/dist/index.mjs.map +1 -1
- package/dist/integrations/ai-sdk.d.mts +122 -4
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +239 -11
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +132 -2
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +176 -8
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.mjs +3 -3
- package/dist/integrations/mcp-client.mjs +4 -4
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +2 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +6 -6
- package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
- package/dist/mcp-Bitg4sjX.mjs.map +1 -0
- package/dist/microphone-D-6y9aiE.mjs +3 -0
- package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
- package/dist/models-BAtL8qsA.mjs.map +1 -0
- package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
- package/dist/models-CE0fBq0U.d.mts.map +1 -0
- package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
- package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
- package/dist/repl-D20JO260.mjs +10 -0
- package/dist/skills/index.d.mts +303 -12
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +6 -6
- package/dist/skills-5DxAV-rn.mjs +1435 -0
- package/dist/skills-5DxAV-rn.mjs.map +1 -0
- package/dist/stt-Bv_dum-R.mjs +433 -0
- package/dist/stt-Bv_dum-R.mjs.map +1 -0
- package/dist/stt-KzSoNvwI.mjs +3 -0
- package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
- package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
- package/dist/tts-5yWeP_I0.mjs +3 -0
- package/dist/tts-DG6denWG.mjs +729 -0
- package/dist/tts-DG6denWG.mjs.map +1 -0
- package/dist/types-s6Py2_DL.d.mts +353 -0
- package/dist/types-s6Py2_DL.d.mts.map +1 -0
- package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
- package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
- package/docs/ai-sdk.md +137 -21
- package/docs/browser.md +241 -2
- package/docs/memory.md +72 -0
- package/docs/stt.md +494 -0
- package/docs/tts.md +569 -0
- package/docs/vision.md +396 -0
- package/package.json +17 -18
- package/dist/auto-update-BbNHbSU1.mjs +0 -3
- package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
- package/dist/gerbil-BfnsFWRE.mjs +0 -644
- package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
- package/dist/gerbil-BjW-z7Fq.mjs +0 -5
- package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
- package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
- package/dist/mcp-R8kRLIKb.mjs.map +0 -1
- package/dist/models-DKULvhOr.mjs.map +0 -1
- package/dist/models-De2-_GmQ.d.mts.map +0 -1
- package/dist/skills-D3CEpgDc.mjs +0 -630
- package/dist/skills-D3CEpgDc.mjs.map +0 -1
- package/dist/types-BS1N92Jt.d.mts +0 -183
- package/dist/types-BS1N92Jt.d.mts.map +0 -1
package/docs/tts.md
ADDED
|
@@ -0,0 +1,569 @@
|
|
|
1
|
+
# Text-to-Speech
|
|
2
|
+
|
|
3
|
+
Gerbil provides on-device text-to-speech, using the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model by default. Generate natural-sounding speech locally without API keys; an internet connection is needed only for the initial model download.
|
|
4
|
+
|
|
5
|
+
## Quick Start
|
|
6
|
+
|
|
7
|
+
### Node.js / CLI
|
|
8
|
+
|
|
9
|
+
```typescript
|
|
10
|
+
import { Gerbil } from "@tryhamster/gerbil";
|
|
11
|
+
|
|
12
|
+
const g = new Gerbil();
|
|
13
|
+
|
|
14
|
+
// Generate speech
|
|
15
|
+
const result = await g.speak("Hello, I'm Gerbil!", {
|
|
16
|
+
voice: "af_heart", // American female (highest quality)
|
|
17
|
+
speed: 1.0
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
// result.audio = Float32Array (PCM samples)
|
|
21
|
+
// result.sampleRate = 24000
|
|
22
|
+
// result.duration = seconds
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### React (Browser)
|
|
26
|
+
|
|
27
|
+
```tsx
|
|
28
|
+
import { useSpeech } from "@tryhamster/gerbil/browser";
|
|
29
|
+
|
|
30
|
+
function SpeechDemo() {
|
|
31
|
+
// Default: Kokoro TTS (24kHz, 28 voices)
|
|
32
|
+
const { speak, stop, isSpeaking, isLoading, listVoices, setVoice } = useSpeech();
|
|
33
|
+
|
|
34
|
+
// Or use Supertonic (44.1kHz, 4 voices, faster)
|
|
35
|
+
// const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
|
|
36
|
+
|
|
37
|
+
if (isLoading) return <div>Loading TTS model...</div>;
|
|
38
|
+
|
|
39
|
+
return (
|
|
40
|
+
<div>
|
|
41
|
+
<select onChange={e => setVoice(e.target.value)}>
|
|
42
|
+
{listVoices().map(v => (
|
|
43
|
+
<option key={v.id} value={v.id}>{v.name} ({v.language})</option>
|
|
44
|
+
))}
|
|
45
|
+
</select>
|
|
46
|
+
|
|
47
|
+
<button onClick={() => speak("Hello world!")}>
|
|
48
|
+
{isSpeaking ? "Speaking..." : "Speak"}
|
|
49
|
+
</button>
|
|
50
|
+
|
|
51
|
+
{isSpeaking && <button onClick={stop}>Stop</button>}
|
|
52
|
+
</div>
|
|
53
|
+
);
|
|
54
|
+
}
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### AI SDK
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
import { experimental_generateSpeech as generateSpeech } from "ai";
|
|
61
|
+
import { gerbil } from "@tryhamster/gerbil/ai";
|
|
62
|
+
|
|
63
|
+
const audio = await generateSpeech({
|
|
64
|
+
model: gerbil.speech(),
|
|
65
|
+
text: "Hello from Gerbil!",
|
|
66
|
+
voice: "bf_emma", // British female
|
|
67
|
+
});
|
|
68
|
+
|
|
69
|
+
// audio.audioData = Uint8Array (WAV format)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### CLI
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
# Speak text with default voice
|
|
76
|
+
gerbil speak "Hello world"
|
|
77
|
+
|
|
78
|
+
# Specify voice and speed
|
|
79
|
+
gerbil speak --voice bf_emma --speed 1.2 "Cheerio!"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Skills
|
|
83
|
+
|
|
84
|
+
```typescript
|
|
85
|
+
import { speak, announce, readAloud } from "@tryhamster/gerbil/skills";
|
|
86
|
+
|
|
87
|
+
// Simple speech
|
|
88
|
+
await speak({ text: "Hello world", voice: "af_heart" });
|
|
89
|
+
|
|
90
|
+
// AI-crafted announcement
|
|
91
|
+
await announce({
|
|
92
|
+
message: "Build completed successfully",
|
|
93
|
+
style: "excited", // casual, formal, excited, calm, urgent
|
|
94
|
+
voice: "af_bella"
|
|
95
|
+
});
|
|
96
|
+
|
|
97
|
+
// Read file aloud
|
|
98
|
+
await readAloud({
|
|
99
|
+
content: "./README.md",
|
|
100
|
+
voice: "bf_emma",
|
|
101
|
+
summarizeIfLong: true // Summarize if > 5000 chars
|
|
102
|
+
});
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Available TTS Models
|
|
106
|
+
|
|
107
|
+
Gerbil supports multiple TTS backends:
|
|
108
|
+
|
|
109
|
+
| Model | Size | Sample Rate | Voices | Speed | Notes |
|
|
110
|
+
|-------|------|-------------|--------|-------|-------|
|
|
111
|
+
| `kokoro-82m` | ~330MB | 24000 Hz | 28 | ~100x RT | Default, highest quality |
|
|
112
|
+
| `supertonic-66m` | ~250MB | 44100 Hz | 4 | ~167x RT | Faster, HiFi output |
|
|
113
|
+
|
|
114
|
+
### Selecting a Model
|
|
115
|
+
|
|
116
|
+
```typescript
|
|
117
|
+
// Use default (Kokoro)
|
|
118
|
+
await g.loadTTS();
|
|
119
|
+
|
|
120
|
+
// Use Supertonic (faster, higher sample rate)
|
|
121
|
+
await g.loadTTS({ model: "supertonic-66m" });
|
|
122
|
+
|
|
123
|
+
// List available models
|
|
124
|
+
const models = await g.listTTSModels();
|
|
125
|
+
// [{ id: "kokoro-82m", sampleRate: 24000, voiceCount: 28 }, ...]
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Available Voices
|
|
129
|
+
|
|
130
|
+
### Kokoro Voices
|
|
131
|
+
|
|
132
|
+
Kokoro provides 28 voices across American and British English. A selection is listed below; call `listVoices()` to get the complete set:
|
|
133
|
+
|
|
134
|
+
### American English - Female (Recommended)
|
|
135
|
+
|
|
136
|
+
| Voice ID | Name | Quality | Description |
|
|
137
|
+
|----------|------|---------|-------------|
|
|
138
|
+
| `af_heart` | Heart | A | Highest quality, warm tone |
|
|
139
|
+
| `af_bella` | Bella | A- | Warm and friendly |
|
|
140
|
+
| `af_nicole` | Nicole | B- | Soft and gentle |
|
|
141
|
+
| `af_sarah` | Sarah | C+ | Clear and professional |
|
|
142
|
+
|
|
143
|
+
### American English - Male
|
|
144
|
+
|
|
145
|
+
| Voice ID | Name | Quality | Description |
|
|
146
|
+
|----------|------|---------|-------------|
|
|
147
|
+
| `am_fenrir` | Fenrir | C+ | Best male quality |
|
|
148
|
+
| `am_michael` | Michael | C+ | Warm and friendly |
|
|
149
|
+
| `am_puck` | Puck | C+ | Neutral tone |
|
|
150
|
+
|
|
151
|
+
### British English - Female
|
|
152
|
+
|
|
153
|
+
| Voice ID | Name | Quality | Description |
|
|
154
|
+
|----------|------|---------|-------------|
|
|
155
|
+
| `bf_emma` | Emma | B- | Elegant and clear |
|
|
156
|
+
| `bf_isabella` | Isabella | C | Sophisticated |
|
|
157
|
+
|
|
158
|
+
### British English - Male
|
|
159
|
+
|
|
160
|
+
| Voice ID | Name | Quality | Description |
|
|
161
|
+
|----------|------|---------|-------------|
|
|
162
|
+
| `bm_george` | George | C | Distinguished |
|
|
163
|
+
| `bm_fable` | Fable | C | Storyteller tone |
|
|
164
|
+
|
|
165
|
+
> **Tip:** `af_heart` is the highest quality voice. Start there and experiment with others.
|
|
166
|
+
|
|
167
|
+
### Supertonic Voices
|
|
168
|
+
|
|
169
|
+
Supertonic provides 4 high-quality voices:
|
|
170
|
+
|
|
171
|
+
| Voice | Gender | Description |
|
|
172
|
+
|-------|--------|-------------|
|
|
173
|
+
| `F1` | Female | Clear and natural |
|
|
174
|
+
| `F2` | Female | Warm and expressive |
|
|
175
|
+
| `M1` | Male | Deep and confident |
|
|
176
|
+
| `M2` | Male | Friendly and casual |
|
|
177
|
+
|
|
178
|
+
## API Reference
|
|
179
|
+
|
|
180
|
+
### Gerbil Class Methods
|
|
181
|
+
|
|
182
|
+
```typescript
|
|
183
|
+
class Gerbil {
|
|
184
|
+
// Load TTS model (auto-called by speak if needed)
|
|
185
|
+
async loadTTS(options?: LoadTTSOptions): Promise<void>;
|
|
186
|
+
|
|
187
|
+
// Check if TTS is loaded
|
|
188
|
+
isTTSLoaded(): boolean;
|
|
189
|
+
|
|
190
|
+
// Generate speech
|
|
191
|
+
async speak(text: string, options?: SpeakOptions): Promise<SpeakResult>;
|
|
192
|
+
|
|
193
|
+
// Stream speech (yields chunks as generated)
|
|
194
|
+
async *speakStream(text: string, options?: SpeakOptions): AsyncGenerator<AudioChunk>;
|
|
195
|
+
|
|
196
|
+
// Get available voices
|
|
197
|
+
listVoices(): VoiceInfo[];
|
|
198
|
+
getVoice(voiceId: string): VoiceInfo | null;
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
### SpeakOptions
|
|
203
|
+
|
|
204
|
+
```typescript
|
|
205
|
+
interface SpeakOptions {
|
|
206
|
+
/** Voice ID (default: "af_heart") */
|
|
207
|
+
voice?: string;
|
|
208
|
+
|
|
209
|
+
/** Speed multiplier 0.5-2.0 (default: 1.0) */
|
|
210
|
+
speed?: number;
|
|
211
|
+
|
|
212
|
+
/** Progress callback during loading */
|
|
213
|
+
onProgress?: (info: ProgressInfo) => void;
|
|
214
|
+
|
|
215
|
+
/** Callback for each audio chunk (streaming) */
|
|
216
|
+
onAudioChunk?: (chunk: AudioChunk) => void;
|
|
217
|
+
}
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
### SpeakResult
|
|
221
|
+
|
|
222
|
+
```typescript
|
|
223
|
+
interface SpeakResult {
|
|
224
|
+
/** PCM audio samples (mono, float32, -1 to 1) */
|
|
225
|
+
audio: Float32Array;
|
|
226
|
+
|
|
227
|
+
/** Sample rate in Hz (24000 for Kokoro, 44100 for Supertonic) */
|
|
228
|
+
sampleRate: number;
|
|
229
|
+
|
|
230
|
+
/** Duration in seconds */
|
|
231
|
+
duration: number;
|
|
232
|
+
|
|
233
|
+
/** Voice ID used */
|
|
234
|
+
voice: string;
|
|
235
|
+
|
|
236
|
+
/** Generation time in milliseconds */
|
|
237
|
+
totalTime: number;
|
|
238
|
+
}
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
## Playing Audio
|
|
242
|
+
|
|
243
|
+
### Browser (Web Audio API)
|
|
244
|
+
|
|
245
|
+
```typescript
|
|
246
|
+
import { playAudio } from "@tryhamster/gerbil/browser";
|
|
247
|
+
|
|
248
|
+
const result = await gerbil.speak("Hello!");
|
|
249
|
+
|
|
250
|
+
// One-liner playback
|
|
251
|
+
const controller = await playAudio(result.audio, result.sampleRate);
|
|
252
|
+
|
|
253
|
+
// Stop early
|
|
254
|
+
controller.stop();
|
|
255
|
+
|
|
256
|
+
// Wait for completion
|
|
257
|
+
await controller.onEnded;
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
### Node.js (Save to File)
|
|
261
|
+
|
|
262
|
+
```typescript
|
|
263
|
+
import { writeFileSync } from "fs";
|
|
264
|
+
import { execSync } from "child_process";
|
|
265
|
+
|
|
266
|
+
const result = await gerbil.speak("Hello!");
|
|
267
|
+
|
|
268
|
+
// Convert to WAV
|
|
269
|
+
function saveWav(filename: string, audio: Float32Array, sampleRate: number) {
|
|
270
|
+
const buffer = Buffer.alloc(44 + audio.length * 2);
|
|
271
|
+
|
|
272
|
+
// WAV header
|
|
273
|
+
buffer.write("RIFF", 0);
|
|
274
|
+
buffer.writeUInt32LE(36 + audio.length * 2, 4);
|
|
275
|
+
buffer.write("WAVE", 8);
|
|
276
|
+
buffer.write("fmt ", 12);
|
|
277
|
+
buffer.writeUInt32LE(16, 16);
|
|
278
|
+
buffer.writeUInt16LE(1, 20); // PCM
|
|
279
|
+
buffer.writeUInt16LE(1, 22); // Mono
|
|
280
|
+
buffer.writeUInt32LE(sampleRate, 24);
|
|
281
|
+
buffer.writeUInt32LE(sampleRate * 2, 28);
|
|
282
|
+
buffer.writeUInt16LE(2, 32);
|
|
283
|
+
buffer.writeUInt16LE(16, 34);
|
|
284
|
+
buffer.write("data", 36);
|
|
285
|
+
buffer.writeUInt32LE(audio.length * 2, 40);
|
|
286
|
+
|
|
287
|
+
// Audio data
|
|
288
|
+
for (let i = 0; i < audio.length; i++) {
|
|
289
|
+
const s = Math.max(-1, Math.min(1, audio[i]));
|
|
290
|
+
buffer.writeInt16LE(Math.round(s * 32767), 44 + i * 2);
|
|
291
|
+
}
|
|
292
|
+
|
|
293
|
+
writeFileSync(filename, buffer);
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
saveWav("output.wav", result.audio, result.sampleRate);
|
|
297
|
+
|
|
298
|
+
// Play on macOS
|
|
299
|
+
execSync("afplay output.wav");
|
|
300
|
+
|
|
301
|
+
// Play on Linux
|
|
302
|
+
// execSync("aplay output.wav");
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
## Streaming Audio
|
|
306
|
+
|
|
307
|
+
For long text, stream audio chunks as they're generated:
|
|
308
|
+
|
|
309
|
+
```typescript
|
|
310
|
+
// Node.js streaming
|
|
311
|
+
for await (const chunk of gerbil.speakStream("Long paragraph of text...")) {
|
|
312
|
+
console.log(`Chunk ${chunk.index}: ${chunk.samples.length} samples`);
|
|
313
|
+
// Process/play chunk.samples
|
|
314
|
+
|
|
315
|
+
if (chunk.isFinal) {
|
|
316
|
+
console.log("Done!");
|
|
317
|
+
}
|
|
318
|
+
}
|
|
319
|
+
|
|
320
|
+
// Browser streaming with seamless playback
|
|
321
|
+
import { createAudioPlayer } from "@tryhamster/gerbil/browser";
|
|
322
|
+
|
|
323
|
+
const player = createAudioPlayer(24000);
|
|
324
|
+
|
|
325
|
+
for await (const chunk of gerbil.speakStream("Long text...")) {
|
|
326
|
+
player.queue(chunk.samples);
|
|
327
|
+
}
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
## useSpeech Hook Reference
|
|
331
|
+
|
|
332
|
+
```typescript
|
|
333
|
+
const {
|
|
334
|
+
// Actions
|
|
335
|
+
speak, // (text: string, opts?) => Promise<void>
|
|
336
|
+
stop, // () => void
|
|
337
|
+
load, // () => void (manual load trigger)
|
|
338
|
+
|
|
339
|
+
// State
|
|
340
|
+
isLoading, // boolean - model loading
|
|
341
|
+
isSpeaking, // boolean - currently speaking
|
|
342
|
+
isReady, // boolean - model ready
|
|
343
|
+
error, // string | null
|
|
344
|
+
|
|
345
|
+
// Voice control
|
|
346
|
+
listVoices, // () => VoiceInfo[] - voices for current model
|
|
347
|
+
currentVoice, // string
|
|
348
|
+
setVoice, // (id: string) => void
|
|
349
|
+
currentSpeed, // number
|
|
350
|
+
setSpeed, // (speed: number) => void
|
|
351
|
+
|
|
352
|
+
// Model info
|
|
353
|
+
currentModel, // TTSModelId - "kokoro-82m" | "supertonic-66m"
|
|
354
|
+
sampleRate, // number - 24000 (Kokoro) or 44100 (Supertonic)
|
|
355
|
+
|
|
356
|
+
// Loading progress
|
|
357
|
+
loadingProgress, // { status, file?, progress? }
|
|
358
|
+
} = useSpeech({
|
|
359
|
+
model: "kokoro-82m", // TTS model: "kokoro-82m" (default) or "supertonic-66m"
|
|
360
|
+
voice: "af_heart", // default voice (model-specific)
|
|
361
|
+
speed: 1.0, // default speed
|
|
362
|
+
autoLoad: false, // when false, the model loads on the first speak() call
|
|
363
|
+
onReady: () => {},
|
|
364
|
+
onError: (err) => {},
|
|
365
|
+
onStart: () => {},
|
|
366
|
+
onEnd: () => {},
|
|
367
|
+
});
|
|
368
|
+
```
|
|
369
|
+
|
|
370
|
+
### Model-Specific Voices
|
|
371
|
+
|
|
372
|
+
Each model has its own set of voices:
|
|
373
|
+
|
|
374
|
+
```typescript
|
|
375
|
+
// Kokoro (default) - 28 voices
|
|
376
|
+
const { listVoices } = useSpeech(); // or model: "kokoro-82m"
|
|
377
|
+
// → af_heart, af_bella, am_fenrir, bf_emma, bm_george, ...
|
|
378
|
+
|
|
379
|
+
// Supertonic - 4 voices
|
|
380
|
+
const { listVoices } = useSpeech({ model: "supertonic-66m" });
|
|
381
|
+
// → F1, F2, M1, M2
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
**Note:** Voice IDs are model-specific. Using a Kokoro voice with Supertonic (or vice versa) will throw an error.
|
|
385
|
+
|
|
386
|
+
## Performance
|
|
387
|
+
|
|
388
|
+
| Metric | Value |
|
|
389
|
+
|--------|-------|
|
|
390
|
+
| Model size | ~330MB |
|
|
391
|
+
| First load | 3-10s (downloads model) |
|
|
392
|
+
| Cached load | <1s |
|
|
393
|
+
| Generation speed | ~8x realtime on M1 Mac |
|
|
394
|
+
| Sample rate | 24kHz |
|
|
395
|
+
| Audio format | Mono Float32 PCM |
|
|
396
|
+
|
|
397
|
+
## Troubleshooting
|
|
398
|
+
|
|
399
|
+
### Audio sounds like gibberish
|
|
400
|
+
|
|
401
|
+
This happens when using raw transformers.js without proper phoneme conversion. Gerbil uses `kokoro-js` which handles grapheme-to-phoneme (G2P) conversion automatically.
|
|
402
|
+
|
|
403
|
+
### Audio is too quiet
|
|
404
|
+
|
|
405
|
+
Kokoro output can be quiet. The audio is normalized by default, but if you're processing raw audio, ensure proper normalization:
|
|
406
|
+
|
|
407
|
+
```typescript
|
|
408
|
+
function normalizeAudio(audio: Float32Array, targetRms = 0.15): Float32Array {
|
|
409
|
+
const currentRms = Math.sqrt(audio.reduce((a, b) => a + b * b, 0) / audio.length);
|
|
410
|
+
const gain = targetRms / currentRms;
|
|
411
|
+
return audio.map(s => Math.max(-1, Math.min(1, s * gain)));
|
|
412
|
+
}
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### Browser autoplay blocked
|
|
416
|
+
|
|
417
|
+
Browsers require user interaction before playing audio. Trigger speech from a click handler:
|
|
418
|
+
|
|
419
|
+
```tsx
|
|
420
|
+
<button onClick={() => speak("Hello!")}>Speak</button>
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
### Voice not found
|
|
424
|
+
|
|
425
|
+
Use `listVoices()` to see the voice IDs available for the current model. Kokoro voice IDs follow the pattern `{language}{gender}_{name}`:
|
|
426
|
+
- `af_` = American female
|
|
427
|
+
- `am_` = American male
|
|
428
|
+
- `bf_` = British female
|
|
429
|
+
- `bm_` = British male
|
|
430
|
+
|
|
431
|
+
## Model Info
|
|
432
|
+
|
|
433
|
+
### Kokoro-82M
|
|
434
|
+
|
|
435
|
+
Uses [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) via the [ONNX export](https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX) and [kokoro-js](https://www.npmjs.com/package/kokoro-js) wrapper.
|
|
436
|
+
|
|
437
|
+
- **Architecture:** StyleTTS2-based
|
|
438
|
+
- **Parameters:** 82M
|
|
439
|
+
- **Languages:** English (US + UK)
|
|
440
|
+
- **License:** Apache 2.0
|
|
441
|
+
|
|
442
|
+
### Supertonic-66M
|
|
443
|
+
|
|
444
|
+
Uses [Supertonic-TTS](https://huggingface.co/onnx-community/Supertonic-TTS-ONNX) via transformers.js pipeline.
|
|
445
|
+
|
|
446
|
+
- **Parameters:** 66M
|
|
447
|
+
- **Languages:** English
|
|
448
|
+
- **Sample Rate:** 44.1kHz (HiFi)
|
|
449
|
+
- **Speed:** ~167x realtime
|
|
450
|
+
- **License:** Apache 2.0
|
|
451
|
+
|
|
452
|
+
---
|
|
453
|
+
|
|
454
|
+
## Adding New TTS Models (Developer Guide)
|
|
455
|
+
|
|
456
|
+
To add a new TTS model to Gerbil, implement it in these locations:
|
|
457
|
+
|
|
458
|
+
### 1. Core TTS Class (`src/core/tts.ts`)
|
|
459
|
+
|
|
460
|
+
Add your model to the TTS registry and create a class that implements the TTS interface:
|
|
461
|
+
|
|
462
|
+
```typescript
|
|
463
|
+
// 1. Add voice definitions
|
|
464
|
+
export const MY_MODEL_VOICES: VoiceInfo[] = [
|
|
465
|
+
{ id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
|
|
466
|
+
// ...
|
|
467
|
+
];
|
|
468
|
+
|
|
469
|
+
// 2. Add to TTS_MODELS registry
|
|
470
|
+
export const TTS_MODELS: Record<string, TTSModelConfig> = {
|
|
471
|
+
// ...existing models...
|
|
472
|
+
"my-model-50m": {
|
|
473
|
+
id: "my-model-50m",
|
|
474
|
+
repo: "hf-org/my-model-onnx",
|
|
475
|
+
description: "My Model 50M - Description",
|
|
476
|
+
size: "~150MB",
|
|
477
|
+
sampleRate: 22050,
|
|
478
|
+
voices: MY_MODEL_VOICES,
|
|
479
|
+
defaultVoice: "voice1",
|
|
480
|
+
languages: ["en"],
|
|
481
|
+
},
|
|
482
|
+
};
|
|
483
|
+
|
|
484
|
+
// 3. Create TTS class (see KokoroTTS or SupertonicTTS as examples)
|
|
485
|
+
export class MyModelTTS {
|
|
486
|
+
async load(options: LoadTTSOptions): Promise<void> { /* ... */ }
|
|
487
|
+
async speak(text: string, options: SpeakOptions): Promise<SpeakResult> { /* ... */ }
|
|
488
|
+
listVoices(): VoiceInfo[] { /* ... */ }
|
|
489
|
+
// ...
|
|
490
|
+
}
|
|
491
|
+
|
|
492
|
+
// 4. Update createTTS factory
|
|
493
|
+
export function createTTS(modelId: string = "kokoro-82m"): TTSBackend {
|
|
494
|
+
if (modelId.startsWith("my-model")) {
|
|
495
|
+
return new MyModelTTS(modelId);
|
|
496
|
+
}
|
|
497
|
+
// ...existing logic...
|
|
498
|
+
}
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
### 2. Browser Hook (`src/browser/index.ts`)
|
|
502
|
+
|
|
503
|
+
Add browser support in the useSpeech hook:
|
|
504
|
+
|
|
505
|
+
```typescript
|
|
506
|
+
// 1. Add voice definitions for browser
|
|
507
|
+
const MY_MODEL_BROWSER_VOICES: BrowserVoiceInfo[] = [
|
|
508
|
+
{ id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
|
|
509
|
+
];
|
|
510
|
+
|
|
511
|
+
// 2. Update TTS_MODELS in browser
|
|
512
|
+
const TTS_MODELS: Record<TTSModelId, {...}> = {
|
|
513
|
+
// ...existing models...
|
|
514
|
+
"my-model-50m": {
|
|
515
|
+
repo: "hf-org/my-model-onnx",
|
|
516
|
+
defaultVoice: "voice1",
|
|
517
|
+
sampleRate: 22050,
|
|
518
|
+
voices: MY_MODEL_BROWSER_VOICES,
|
|
519
|
+
},
|
|
520
|
+
};
|
|
521
|
+
|
|
522
|
+
// 3. Update TTSModelId type
|
|
523
|
+
export type TTSModelId = "kokoro-82m" | "supertonic-66m" | "my-model-50m";
|
|
524
|
+
|
|
525
|
+
// 4. Add loading logic in useSpeech useEffect
|
|
526
|
+
if (modelId === "my-model-50m") {
|
|
527
|
+
// Load using appropriate method (pipeline, custom loader, etc.)
|
|
528
|
+
}
|
|
529
|
+
```
|
|
530
|
+
|
|
531
|
+
### 3. Gerbil Class (`src/core/gerbil.ts`)
|
|
532
|
+
|
|
533
|
+
Update the main Gerbil class to support the new model:
|
|
534
|
+
|
|
535
|
+
```typescript
|
|
536
|
+
// Add type import
|
|
537
|
+
type MyModelTTSType = import("./tts.js").MyModelTTS;
|
|
538
|
+
|
|
539
|
+
// Update TTSBackendType union
|
|
540
|
+
type TTSBackendType = KokoroTTSType | SupertonicTTSType | MyModelTTSType;
|
|
541
|
+
```
|
|
542
|
+
|
|
543
|
+
### 4. Integration Points
|
|
544
|
+
|
|
545
|
+
Update these integration files if applicable:
|
|
546
|
+
|
|
547
|
+
- **AI SDK** (`src/integrations/ai-sdk.ts`): Add speech model support
|
|
548
|
+
- **Skills** (`src/skills/builtin/speak.ts`, etc.): Update voice enums
|
|
549
|
+
- **CLI** (`src/cli/index.ts`): Add model to CLI options
|
|
550
|
+
|
|
551
|
+
### 5. Documentation
|
|
552
|
+
|
|
553
|
+
Update documentation:
|
|
554
|
+
|
|
555
|
+
- `docs/tts.md`: Add model to tables and examples
|
|
556
|
+
- `.cursor/rules/gerbil/tts.mdc`: Update implementation patterns
|
|
557
|
+
|
|
558
|
+
### File Summary
|
|
559
|
+
|
|
560
|
+
| File | Purpose |
|
|
561
|
+
|------|---------|
|
|
562
|
+
| `src/core/tts.ts` | Core TTS classes, voice registry, model config |
|
|
563
|
+
| `src/browser/index.ts` | React hooks (useSpeech, useVoiceChat) |
|
|
564
|
+
| `src/core/gerbil.ts` | Main Gerbil class integration |
|
|
565
|
+
| `src/integrations/ai-sdk.ts` | AI SDK v5 speech model |
|
|
566
|
+
| `src/skills/builtin/speak.ts` | Speak skill voice options |
|
|
567
|
+
| `docs/tts.md` | User documentation |
|
|
568
|
+
| `.cursor/rules/gerbil/tts.mdc` | Developer patterns |
|
|
569
|
+
|