@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/README.md +79 -14
  2. package/dist/auto-update-DsWBBnEk.mjs +3 -0
  3. package/dist/browser/index.d.mts +401 -5
  4. package/dist/browser/index.d.mts.map +1 -1
  5. package/dist/browser/index.mjs +1772 -146
  6. package/dist/browser/index.mjs.map +1 -1
  7. package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-JEPeM2YE.mjs} +1 -1
  8. package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-Y9F7W5VQ.mjs} +514 -73
  9. package/dist/chrome-backend-Y9F7W5VQ.mjs.map +1 -0
  10. package/dist/cli.mjs +3359 -646
  11. package/dist/cli.mjs.map +1 -1
  12. package/dist/frameworks/express.d.mts +1 -1
  13. package/dist/frameworks/express.mjs +3 -3
  14. package/dist/frameworks/fastify.d.mts +1 -1
  15. package/dist/frameworks/fastify.mjs +3 -3
  16. package/dist/frameworks/hono.d.mts +1 -1
  17. package/dist/frameworks/hono.mjs +3 -3
  18. package/dist/frameworks/next.d.mts +2 -2
  19. package/dist/frameworks/next.mjs +3 -3
  20. package/dist/frameworks/react.d.mts +1 -1
  21. package/dist/frameworks/trpc.d.mts +1 -1
  22. package/dist/frameworks/trpc.mjs +3 -3
  23. package/dist/gerbil-DeQlX_Mt.mjs +5 -0
  24. package/dist/gerbil-POAz8peb.d.mts +431 -0
  25. package/dist/gerbil-POAz8peb.d.mts.map +1 -0
  26. package/dist/gerbil-yoSpRHgv.mjs +1463 -0
  27. package/dist/gerbil-yoSpRHgv.mjs.map +1 -0
  28. package/dist/index.d.mts +395 -9
  29. package/dist/index.d.mts.map +1 -1
  30. package/dist/index.mjs +8 -6
  31. package/dist/index.mjs.map +1 -1
  32. package/dist/integrations/ai-sdk.d.mts +122 -4
  33. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  34. package/dist/integrations/ai-sdk.mjs +239 -11
  35. package/dist/integrations/ai-sdk.mjs.map +1 -1
  36. package/dist/integrations/langchain.d.mts +132 -2
  37. package/dist/integrations/langchain.d.mts.map +1 -1
  38. package/dist/integrations/langchain.mjs +176 -8
  39. package/dist/integrations/langchain.mjs.map +1 -1
  40. package/dist/integrations/llamaindex.d.mts +1 -1
  41. package/dist/integrations/llamaindex.mjs +3 -3
  42. package/dist/integrations/mcp-client.mjs +4 -4
  43. package/dist/integrations/mcp-client.mjs.map +1 -1
  44. package/dist/integrations/mcp.d.mts +2 -2
  45. package/dist/integrations/mcp.d.mts.map +1 -1
  46. package/dist/integrations/mcp.mjs +6 -6
  47. package/dist/{mcp-R8kRLIKb.mjs → mcp-Bitg4sjX.mjs} +10 -37
  48. package/dist/mcp-Bitg4sjX.mjs.map +1 -0
  49. package/dist/microphone-D-6y9aiE.mjs +3 -0
  50. package/dist/{models-DKULvhOr.mjs → models-BAtL8qsA.mjs} +42 -7
  51. package/dist/models-BAtL8qsA.mjs.map +1 -0
  52. package/dist/{models-De2-_GmQ.d.mts → models-CE0fBq0U.d.mts} +2 -2
  53. package/dist/models-CE0fBq0U.d.mts.map +1 -0
  54. package/dist/{one-liner-BUQR0nqq.mjs → one-liner-B1rmFto6.mjs} +2 -2
  55. package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-B1rmFto6.mjs.map} +1 -1
  56. package/dist/repl-D20JO260.mjs +10 -0
  57. package/dist/skills/index.d.mts +303 -12
  58. package/dist/skills/index.d.mts.map +1 -1
  59. package/dist/skills/index.mjs +6 -6
  60. package/dist/skills-5DxAV-rn.mjs +1435 -0
  61. package/dist/skills-5DxAV-rn.mjs.map +1 -0
  62. package/dist/stt-Bv_dum-R.mjs +433 -0
  63. package/dist/stt-Bv_dum-R.mjs.map +1 -0
  64. package/dist/stt-KzSoNvwI.mjs +3 -0
  65. package/dist/{tools-BsiEE6f2.mjs → tools-IYPrqoek.mjs} +6 -7
  66. package/dist/{tools-BsiEE6f2.mjs.map → tools-IYPrqoek.mjs.map} +1 -1
  67. package/dist/tts-5yWeP_I0.mjs +3 -0
  68. package/dist/tts-DG6denWG.mjs +729 -0
  69. package/dist/tts-DG6denWG.mjs.map +1 -0
  70. package/dist/types-s6Py2_DL.d.mts +353 -0
  71. package/dist/types-s6Py2_DL.d.mts.map +1 -0
  72. package/dist/{utils-7vXqtq2Q.mjs → utils-CkB4Roi6.mjs} +1 -1
  73. package/dist/{utils-7vXqtq2Q.mjs.map → utils-CkB4Roi6.mjs.map} +1 -1
  74. package/docs/ai-sdk.md +137 -21
  75. package/docs/browser.md +241 -2
  76. package/docs/memory.md +72 -0
  77. package/docs/stt.md +494 -0
  78. package/docs/tts.md +569 -0
  79. package/docs/vision.md +396 -0
  80. package/package.json +17 -18
  81. package/dist/auto-update-BbNHbSU1.mjs +0 -3
  82. package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
  83. package/dist/gerbil-BfnsFWRE.mjs +0 -644
  84. package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
  85. package/dist/gerbil-BjW-z7Fq.mjs +0 -5
  86. package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
  87. package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
  88. package/dist/mcp-R8kRLIKb.mjs.map +0 -1
  89. package/dist/models-DKULvhOr.mjs.map +0 -1
  90. package/dist/models-De2-_GmQ.d.mts.map +0 -1
  91. package/dist/skills-D3CEpgDc.mjs +0 -630
  92. package/dist/skills-D3CEpgDc.mjs.map +0 -1
  93. package/dist/types-BS1N92Jt.d.mts +0 -183
  94. package/dist/types-BS1N92Jt.d.mts.map +0 -1
package/docs/tts.md ADDED
@@ -0,0 +1,569 @@
1
+ # Text-to-Speech
2
+
3
+ Gerbil provides on-device text-to-speech using the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model. Generate natural-sounding speech locally without API keys; an internet connection is only needed the first time a model is downloaded.
4
+
5
+ ## Quick Start
6
+
7
+ ### Node.js
8
+
9
+ ```typescript
10
+ import { Gerbil } from "@tryhamster/gerbil";
11
+
12
+ const g = new Gerbil();
13
+
14
+ // Generate speech
15
+ const result = await g.speak("Hello, I'm Gerbil!", {
16
+ voice: "af_heart", // American female (highest quality)
17
+ speed: 1.0
18
+ });
19
+
20
+ // result.audio = Float32Array (PCM samples)
21
+ // result.sampleRate = 24000
22
+ // result.duration = seconds
23
+ ```
24
+
25
+ ### React (Browser)
26
+
27
+ ```tsx
28
+ import { useSpeech } from "@tryhamster/gerbil/browser";
29
+
30
+ function SpeechDemo() {
31
+ // Default: Kokoro TTS (24kHz, 28 voices)
32
+ const { speak, stop, isSpeaking, isLoading, listVoices, setVoice } = useSpeech();
33
+
34
+ // Or use Supertonic (44.1kHz, 4 voices, faster)
35
+ // const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
36
+
37
+ if (isLoading) return <div>Loading TTS model...</div>;
38
+
39
+ return (
40
+ <div>
41
+ <select onChange={e => setVoice(e.target.value)}>
42
+ {listVoices().map(v => (
43
+ <option key={v.id} value={v.id}>{v.name} ({v.language})</option>
44
+ ))}
45
+ </select>
46
+
47
+ <button onClick={() => speak("Hello world!")}>
48
+ {isSpeaking ? "Speaking..." : "Speak"}
49
+ </button>
50
+
51
+ {isSpeaking && <button onClick={stop}>Stop</button>}
52
+ </div>
53
+ );
54
+ }
55
+ ```
56
+
57
+ ### AI SDK
58
+
59
+ ```typescript
60
+ import { experimental_generateSpeech as generateSpeech } from "ai";
61
+ import { gerbil } from "@tryhamster/gerbil/ai";
62
+
63
+ const audio = await generateSpeech({
64
+ model: gerbil.speech(),
65
+ text: "Hello from Gerbil!",
66
+ voice: "bf_emma", // British female
67
+ });
68
+
69
+ // audio.audioData = Uint8Array (WAV format)
70
+ ```
71
+
72
+ ### CLI
73
+
74
+ ```bash
75
+ # Speak text with default voice
76
+ gerbil speak "Hello world"
77
+
78
+ # Specify voice and speed
79
+ gerbil speak --voice bf_emma --speed 1.2 "Cheerio!"
80
+ ```
81
+
82
+ ### Skills
83
+
84
+ ```typescript
85
+ import { speak, announce, readAloud } from "@tryhamster/gerbil/skills";
86
+
87
+ // Simple speech
88
+ await speak({ text: "Hello world", voice: "af_heart" });
89
+
90
+ // AI-crafted announcement
91
+ await announce({
92
+ message: "Build completed successfully",
93
+ style: "excited", // casual, formal, excited, calm, urgent
94
+ voice: "af_bella"
95
+ });
96
+
97
+ // Read file aloud
98
+ await readAloud({
99
+ content: "./README.md",
100
+ voice: "bf_emma",
101
+ summarizeIfLong: true // Summarize if > 5000 chars
102
+ });
103
+ ```
104
+
105
+ ## Available TTS Models
106
+
107
+ Gerbil supports multiple TTS backends:
108
+
109
+ | Model | Size | Sample Rate | Voices | Speed | Notes |
110
+ |-------|------|-------------|--------|-------|-------|
111
+ | `kokoro-82m` | ~330MB | 24000 Hz | 28 | ~100x RT | Default, highest quality |
112
+ | `supertonic-66m` | ~250MB | 44100 Hz | 4 | ~167x RT | Faster, HiFi output |
113
+
114
+ ### Selecting a Model
115
+
116
+ ```typescript
117
+ // Use default (Kokoro)
118
+ await g.loadTTS();
119
+
120
+ // Use Supertonic (faster, higher sample rate)
121
+ await g.loadTTS({ model: "supertonic-66m" });
122
+
123
+ // List available models
124
+ const models = await g.listTTSModels();
125
+ // [{ id: "kokoro-82m", sampleRate: 24000, voiceCount: 28 }, ...]
126
+ ```
127
+
128
+ ## Available Voices
129
+
130
+ ### Kokoro Voices
131
+
132
+ Kokoro provides 28 voices across American and British English. A selection is listed below; call `listVoices()` for the complete set:
133
+
134
+ #### American English - Female (Recommended)
135
+
136
+ | Voice ID | Name | Quality | Description |
137
+ |----------|------|---------|-------------|
138
+ | `af_heart` | Heart | A | Highest quality, warm tone |
139
+ | `af_bella` | Bella | A- | Warm and friendly |
140
+ | `af_nicole` | Nicole | B- | Soft and gentle |
141
+ | `af_sarah` | Sarah | C+ | Clear and professional |
142
+
143
+ #### American English - Male
144
+
145
+ | Voice ID | Name | Quality | Description |
146
+ |----------|------|---------|-------------|
147
+ | `am_fenrir` | Fenrir | C+ | Best male quality |
148
+ | `am_michael` | Michael | C+ | Warm and friendly |
149
+ | `am_puck` | Puck | C+ | Neutral tone |
150
+
151
+ #### British English - Female
152
+
153
+ | Voice ID | Name | Quality | Description |
154
+ |----------|------|---------|-------------|
155
+ | `bf_emma` | Emma | B- | Elegant and clear |
156
+ | `bf_isabella` | Isabella | C | Sophisticated |
157
+
158
+ #### British English - Male
159
+
160
+ | Voice ID | Name | Quality | Description |
161
+ |----------|------|---------|-------------|
162
+ | `bm_george` | George | C | Distinguished |
163
+ | `bm_fable` | Fable | C | Storyteller tone |
164
+
165
+ > **Tip:** `af_heart` is the highest quality voice. Start there and experiment with others.
166
+
167
+ ### Supertonic Voices
168
+
169
+ Supertonic provides 4 high-quality voices:
170
+
171
+ | Voice | Gender | Description |
172
+ |-------|--------|-------------|
173
+ | `F1` | Female | Clear and natural |
174
+ | `F2` | Female | Warm and expressive |
175
+ | `M1` | Male | Deep and confident |
176
+ | `M2` | Male | Friendly and casual |
177
+
178
+ ## API Reference
179
+
180
+ ### Gerbil Class Methods
181
+
182
+ ```typescript
183
+ class Gerbil {
184
+ // Load TTS model (auto-called by speak if needed)
185
+ async loadTTS(options?: LoadTTSOptions): Promise<void>;
186
+
187
+ // Check if TTS is loaded
188
+ isTTSLoaded(): boolean;
189
+
190
+ // Generate speech
191
+ async speak(text: string, options?: SpeakOptions): Promise<SpeakResult>;
192
+
193
+ // Stream speech (yields chunks as generated)
194
+ async *speakStream(text: string, options?: SpeakOptions): AsyncGenerator<AudioChunk>;
195
+
196
+ // Get available voices
197
+ listVoices(): VoiceInfo[];
198
+ getVoice(voiceId: string): VoiceInfo | null;
199
+ }
200
+ ```
201
+
202
+ ### SpeakOptions
203
+
204
+ ```typescript
205
+ interface SpeakOptions {
206
+ /** Voice ID (default: "af_heart") */
207
+ voice?: string;
208
+
209
+ /** Speed multiplier 0.5-2.0 (default: 1.0) */
210
+ speed?: number;
211
+
212
+ /** Progress callback during loading */
213
+ onProgress?: (info: ProgressInfo) => void;
214
+
215
+ /** Callback for each audio chunk (streaming) */
216
+ onAudioChunk?: (chunk: AudioChunk) => void;
217
+ }
218
+ ```
219
+
220
+ ### SpeakResult
221
+
222
+ ```typescript
223
+ interface SpeakResult {
224
+ /** PCM audio samples (mono, float32, -1 to 1) */
225
+ audio: Float32Array;
226
+
227
+ /** Sample rate in Hz (24000 for Kokoro, 44100 for Supertonic) */
228
+ sampleRate: number;
229
+
230
+ /** Duration in seconds */
231
+ duration: number;
232
+
233
+ /** Voice ID used */
234
+ voice: string;
235
+
236
+ /** Generation time in milliseconds */
237
+ totalTime: number;
238
+ }
239
+ ```
240
+
241
+ ## Playing Audio
242
+
243
+ ### Browser (Web Audio API)
244
+
245
+ ```typescript
246
+ import { playAudio } from "@tryhamster/gerbil/browser";
247
+
248
+ const result = await gerbil.speak("Hello!");
249
+
250
+ // One-liner playback
251
+ const controller = await playAudio(result.audio, result.sampleRate);
252
+
253
+ // Stop early
254
+ controller.stop();
255
+
256
+ // Wait for completion
257
+ await controller.onEnded;
258
+ ```
259
+
260
+ ### Node.js (Save to File)
261
+
262
+ ```typescript
263
+ import { writeFileSync } from "fs";
264
+ import { execSync } from "child_process";
265
+
266
+ const result = await gerbil.speak("Hello!");
267
+
268
+ // Convert to WAV
269
+ function saveWav(filename: string, audio: Float32Array, sampleRate: number) {
270
+ const buffer = Buffer.alloc(44 + audio.length * 2);
271
+
272
+ // WAV header
273
+ buffer.write("RIFF", 0);
274
+ buffer.writeUInt32LE(36 + audio.length * 2, 4);
275
+ buffer.write("WAVE", 8);
276
+ buffer.write("fmt ", 12);
277
+ buffer.writeUInt32LE(16, 16);
278
+ buffer.writeUInt16LE(1, 20); // PCM
279
+ buffer.writeUInt16LE(1, 22); // Mono
280
+ buffer.writeUInt32LE(sampleRate, 24);
281
+ buffer.writeUInt32LE(sampleRate * 2, 28);
282
+ buffer.writeUInt16LE(2, 32);
283
+ buffer.writeUInt16LE(16, 34);
284
+ buffer.write("data", 36);
285
+ buffer.writeUInt32LE(audio.length * 2, 40);
286
+
287
+ // Audio data
288
+ for (let i = 0; i < audio.length; i++) {
289
+ const s = Math.max(-1, Math.min(1, audio[i]));
290
+ buffer.writeInt16LE(Math.round(s * 32767), 44 + i * 2);
291
+ }
292
+
293
+ writeFileSync(filename, buffer);
294
+ }
295
+
296
+ saveWav("output.wav", result.audio, result.sampleRate);
297
+
298
+ // Play on macOS
299
+ execSync("afplay output.wav");
300
+
301
+ // Play on Linux
302
+ // execSync("aplay output.wav");
303
+ ```
304
+
305
+ ## Streaming Audio
306
+
307
+ For long text, stream audio chunks as they're generated:
308
+
309
+ ```typescript
310
+ // Node.js streaming
311
+ for await (const chunk of gerbil.speakStream("Long paragraph of text...")) {
312
+ console.log(`Chunk ${chunk.index}: ${chunk.samples.length} samples`);
313
+ // Process/play chunk.samples
314
+
315
+ if (chunk.isFinal) {
316
+ console.log("Done!");
317
+ }
318
+ }
319
+
320
+ // Browser streaming with seamless playback
321
+ import { createAudioPlayer } from "@tryhamster/gerbil/browser";
322
+
323
+ const player = createAudioPlayer(24000);
324
+
325
+ for await (const chunk of gerbil.speakStream("Long text...")) {
326
+ player.queue(chunk.samples);
327
+ }
328
+ ```
329
+
330
+ ## useSpeech Hook Reference
331
+
332
+ ```typescript
333
+ const {
334
+ // Actions
335
+ speak, // (text: string, opts?) => Promise<void>
336
+ stop, // () => void
337
+ load, // () => void (manual load trigger)
338
+
339
+ // State
340
+ isLoading, // boolean - model loading
341
+ isSpeaking, // boolean - currently speaking
342
+ isReady, // boolean - model ready
343
+ error, // string | null
344
+
345
+ // Voice control
346
+ listVoices, // () => VoiceInfo[] - voices for current model
347
+ currentVoice, // string
348
+ setVoice, // (id: string) => void
349
+ currentSpeed, // number
350
+ setSpeed, // (speed: number) => void
351
+
352
+ // Model info
353
+ currentModel, // TTSModelId - "kokoro-82m" | "supertonic-66m"
354
+ sampleRate, // number - 24000 (Kokoro) or 44100 (Supertonic)
355
+
356
+ // Loading progress
357
+ loadingProgress, // { status, file?, progress? }
358
+ } = useSpeech({
359
+ model: "kokoro-82m", // TTS model: "kokoro-82m" (default) or "supertonic-66m"
360
+ voice: "af_heart", // default voice (model-specific)
361
+ speed: 1.0, // default speed
362
+ autoLoad: false, // when false, the model is loaded on the first speak() call
363
+ onReady: () => {},
364
+ onError: (err) => {},
365
+ onStart: () => {},
366
+ onEnd: () => {},
367
+ });
368
+ ```
369
+
370
+ ### Model-Specific Voices
371
+
372
+ Each model has its own set of voices:
373
+
374
+ ```typescript
375
+ // Kokoro (default) - 28 voices
376
+ const { listVoices } = useSpeech(); // or model: "kokoro-82m"
377
+ // → af_heart, af_bella, am_fenrir, bf_emma, bm_george, ...
378
+
379
+ // Supertonic - 4 voices
380
+ const { listVoices } = useSpeech({ model: "supertonic-66m" });
381
+ // → F1, F2, M1, M2
382
+ ```
383
+
384
+ **Note:** Voice IDs are model-specific. Using a Kokoro voice with Supertonic (or vice versa) will throw an error.
385
+
386
+ ## Performance
387
+
388
+ | Metric | Value |
389
+ |--------|-------|
390
+ | Model size | ~330MB |
391
+ | First load | 3-10s (downloads model) |
392
+ | Cached load | <1s |
393
+ | Generation speed | ~8x realtime on M1 Mac |
394
+ | Sample rate | 24kHz |
395
+ | Audio format | Mono Float32 PCM |
396
+
397
+ ## Troubleshooting
398
+
399
+ ### Audio sounds like gibberish
400
+
401
+ This happens when using raw transformers.js without proper phoneme conversion. Gerbil uses `kokoro-js` which handles grapheme-to-phoneme (G2P) conversion automatically.
402
+
403
+ ### Audio is too quiet
404
+
405
+ Kokoro output can be quiet. The audio is normalized by default, but if you're processing raw audio, ensure proper normalization:
406
+
407
+ ```typescript
408
+ function normalizeAudio(audio: Float32Array, targetRms = 0.15): Float32Array {
409
+ const currentRms = Math.sqrt(audio.reduce((a, b) => a + b * b, 0) / audio.length);
410
+ const gain = currentRms > 0 ? targetRms / currentRms : 1; // silent input: keep gain at 1 to avoid NaN samples
411
+ return audio.map(s => Math.max(-1, Math.min(1, s * gain)));
412
+ }
413
+ ```
414
+
415
+ ### Browser autoplay blocked
416
+
417
+ Browsers require user interaction before playing audio. Trigger speech from a click handler:
418
+
419
+ ```tsx
420
+ <button onClick={() => speak("Hello!")}>Speak</button>
421
+ ```
422
+
423
+ ### Voice not found
424
+
425
+ Use `listVoices()` to see available voice IDs. Kokoro voice IDs follow the pattern `{language}{gender}_{name}` (Supertonic uses short IDs such as `F1` and `M1`):
426
+ - `af_` = American female
427
+ - `am_` = American male
428
+ - `bf_` = British female
429
+ - `bm_` = British male
430
+
431
+ ## Model Info
432
+
433
+ ### Kokoro-82M
434
+
435
+ Uses [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) via the [ONNX export](https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX) and [kokoro-js](https://www.npmjs.com/package/kokoro-js) wrapper.
436
+
437
+ - **Architecture:** StyleTTS2-based
438
+ - **Parameters:** 82M
439
+ - **Languages:** English (US + UK)
440
+ - **License:** Apache 2.0
441
+
442
+ ### Supertonic-66M
443
+
444
+ Uses [Supertonic-TTS](https://huggingface.co/onnx-community/Supertonic-TTS-ONNX) via transformers.js pipeline.
445
+
446
+ - **Parameters:** 66M
447
+ - **Languages:** English
448
+ - **Sample Rate:** 44.1kHz (HiFi)
449
+ - **Speed:** ~167x realtime
450
+ - **License:** Apache 2.0
451
+
452
+ ---
453
+
454
+ ## Adding New TTS Models (Developer Guide)
455
+
456
+ To add a new TTS model to Gerbil, implement it in these locations:
457
+
458
+ ### 1. Core TTS Class (`src/core/tts.ts`)
459
+
460
+ Add your model to the TTS registry and create a class that implements the TTS interface:
461
+
462
+ ```typescript
463
+ // 1. Add voice definitions
464
+ export const MY_MODEL_VOICES: VoiceInfo[] = [
465
+ { id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
466
+ // ...
467
+ ];
468
+
469
+ // 2. Add to TTS_MODELS registry
470
+ export const TTS_MODELS: Record<string, TTSModelConfig> = {
471
+ // ...existing models...
472
+ "my-model-50m": {
473
+ id: "my-model-50m",
474
+ repo: "hf-org/my-model-onnx",
475
+ description: "My Model 50M - Description",
476
+ size: "~150MB",
477
+ sampleRate: 22050,
478
+ voices: MY_MODEL_VOICES,
479
+ defaultVoice: "voice1",
480
+ languages: ["en"],
481
+ },
482
+ };
483
+
484
+ // 3. Create TTS class (see KokoroTTS or SupertonicTTS as examples)
485
+ export class MyModelTTS {
486
+ async load(options: LoadTTSOptions): Promise<void> { /* ... */ }
487
+ async speak(text: string, options: SpeakOptions): Promise<SpeakResult> { /* ... */ }
488
+ listVoices(): VoiceInfo[] { /* ... */ }
489
+ // ...
490
+ }
491
+
492
+ // 4. Update createTTS factory
493
+ export function createTTS(modelId: string = "kokoro-82m"): TTSBackend {
494
+ if (modelId.startsWith("my-model")) {
495
+ return new MyModelTTS(modelId);
496
+ }
497
+ // ...existing logic...
498
+ }
499
+ ```
500
+
501
+ ### 2. Browser Hook (`src/browser/index.ts`)
502
+
503
+ Add browser support in the useSpeech hook:
504
+
505
+ ```typescript
506
+ // 1. Add voice definitions for browser
507
+ const MY_MODEL_BROWSER_VOICES: BrowserVoiceInfo[] = [
508
+ { id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
509
+ ];
510
+
511
+ // 2. Update TTS_MODELS in browser
512
+ const TTS_MODELS: Record<TTSModelId, {...}> = {
513
+ // ...existing models...
514
+ "my-model-50m": {
515
+ repo: "hf-org/my-model-onnx",
516
+ defaultVoice: "voice1",
517
+ sampleRate: 22050,
518
+ voices: MY_MODEL_BROWSER_VOICES,
519
+ },
520
+ };
521
+
522
+ // 3. Update TTSModelId type
523
+ export type TTSModelId = "kokoro-82m" | "supertonic-66m" | "my-model-50m";
524
+
525
+ // 4. Add loading logic in useSpeech useEffect
526
+ if (modelId === "my-model-50m") {
527
+ // Load using appropriate method (pipeline, custom loader, etc.)
528
+ }
529
+ ```
530
+
531
+ ### 3. Gerbil Class (`src/core/gerbil.ts`)
532
+
533
+ Update the main Gerbil class to support the new model:
534
+
535
+ ```typescript
536
+ // Add type import
537
+ type MyModelTTSType = import("./tts.js").MyModelTTS;
538
+
539
+ // Update TTSBackendType union
540
+ type TTSBackendType = KokoroTTSType | SupertonicTTSType | MyModelTTSType;
541
+ ```
542
+
543
+ ### 4. Integration Points
544
+
545
+ Update these integration files if applicable:
546
+
547
+ - **AI SDK** (`src/integrations/ai-sdk.ts`): Add speech model support
548
+ - **Skills** (`src/skills/builtin/speak.ts`, etc.): Update voice enums
549
+ - **CLI** (`src/cli/index.ts`): Add model to CLI options
550
+
551
+ ### 5. Documentation
552
+
553
+ Update documentation:
554
+
555
+ - `docs/tts.md`: Add model to tables and examples
556
+ - `.cursor/rules/gerbil/tts.mdc`: Update implementation patterns
557
+
558
+ ### File Summary
559
+
560
+ | File | Purpose |
561
+ |------|---------|
562
+ | `src/core/tts.ts` | Core TTS classes, voice registry, model config |
563
+ | `src/browser/index.ts` | React hooks (useSpeech, useVoiceChat) |
564
+ | `src/core/gerbil.ts` | Main Gerbil class integration |
565
+ | `src/integrations/ai-sdk.ts` | AI SDK v5 speech model |
566
+ | `src/skills/builtin/speak.ts` | Speak skill voice options |
567
+ | `docs/tts.md` | User documentation |
568
+ | `.cursor/rules/gerbil/tts.mdc` | Developer patterns |
569
+