@tryhamster/gerbil 1.0.0-rc.9 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +318 -104
- package/dist/architectures-C1I5V3Dt.mjs +6070 -0
- package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
- package/dist/browser/index.d.ts +276 -590
- package/dist/browser/index.d.ts.map +1 -1
- package/dist/browser/index.js +592 -2334
- package/dist/browser/index.js.map +1 -1
- package/dist/cli.mjs +625 -1098
- package/dist/cli.mjs.map +1 -1
- package/dist/defaults-9komdrbY.mjs +24 -0
- package/dist/defaults-9komdrbY.mjs.map +1 -0
- package/dist/frameworks/express.d.mts +1 -3
- package/dist/frameworks/express.d.mts.map +1 -1
- package/dist/frameworks/express.mjs +7 -7
- package/dist/frameworks/express.mjs.map +1 -1
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.d.mts.map +1 -1
- package/dist/frameworks/fastify.mjs +3 -3
- package/dist/frameworks/fastify.mjs.map +1 -1
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.d.mts.map +1 -1
- package/dist/frameworks/hono.mjs +4 -4
- package/dist/frameworks/hono.mjs.map +1 -1
- package/dist/frameworks/next.d.mts +3 -2
- package/dist/frameworks/next.d.mts.map +1 -1
- package/dist/frameworks/next.mjs +4 -4
- package/dist/frameworks/next.mjs.map +1 -1
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts.map +1 -1
- package/dist/frameworks/trpc.mjs +4 -4
- package/dist/frameworks/trpc.mjs.map +1 -1
- package/dist/gerbil-BetB5xb0.d.mts +488 -0
- package/dist/gerbil-BetB5xb0.d.mts.map +1 -0
- package/dist/gerbil-CTZUa8EZ.mjs +4 -0
- package/dist/gerbil-DNniplr4.mjs +1656 -0
- package/dist/gerbil-DNniplr4.mjs.map +1 -0
- package/dist/gpu/hooks.d.mts +640 -0
- package/dist/gpu/hooks.d.mts.map +1 -0
- package/dist/gpu/hooks.mjs +1369 -0
- package/dist/gpu/hooks.mjs.map +1 -0
- package/dist/gpu/index.d.mts +2 -0
- package/dist/gpu/index.mjs +6 -0
- package/dist/gpu-DFuglcEx.mjs +3790 -0
- package/dist/gpu-DFuglcEx.mjs.map +1 -0
- package/dist/index-Dgmb2kE3.d.mts +245 -0
- package/dist/index-Dgmb2kE3.d.mts.map +1 -0
- package/dist/index-DukkJRMj.d.mts +2114 -0
- package/dist/index-DukkJRMj.d.mts.map +1 -0
- package/dist/index.d.mts +22 -487
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +13 -8
- package/dist/index.mjs.map +1 -1
- package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
- package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
- package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
- package/dist/integrations/ai-sdk.d.mts +75 -6
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +131 -15
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +1 -1
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +5 -5
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.d.mts.map +1 -1
- package/dist/integrations/llamaindex.mjs +5 -5
- package/dist/integrations/llamaindex.mjs.map +1 -1
- package/dist/integrations/mcp-client.mjs +3 -3
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +3 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +5 -5
- package/dist/{mcp-BvbriaBy.mjs → mcp-D2vvH1Xc.mjs} +4 -4
- package/dist/mcp-D2vvH1Xc.mjs.map +1 -0
- package/dist/memory/index.d.mts +3 -0
- package/dist/memory/index.mjs +6 -0
- package/dist/memory-D1P7Tmda.mjs +4 -0
- package/dist/memory-DVN0MnIG.mjs +132 -0
- package/dist/memory-DVN0MnIG.mjs.map +1 -0
- package/dist/memory-Dj0J1v88.mjs +294 -0
- package/dist/memory-Dj0J1v88.mjs.map +1 -0
- package/dist/moonshine-stt-17dpP1kr.mjs +4 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs +11962 -0
- package/dist/moonshine-stt-4ojLtMq7.mjs.map +1 -0
- package/dist/{one-liner-s-lD8rCC.mjs → one-liner-JhdIPxzF.mjs} +14 -16
- package/dist/one-liner-JhdIPxzF.mjs.map +1 -0
- package/dist/repl-BDRkwPGX.mjs +9 -0
- package/dist/skills/index.d.mts +270 -320
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +5 -5
- package/dist/{skills-CD3Orlex.mjs → skills-CU694Dc8.mjs} +187 -32
- package/dist/skills-CU694Dc8.mjs.map +1 -0
- package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
- package/dist/tools-DQ1mPUw5.mjs.map +1 -0
- package/dist/types-DQBe2lFo.d.mts +165 -0
- package/dist/types-DQBe2lFo.d.mts.map +1 -0
- package/dist/{types-CiTc7ez3.d.mts → types-LlyYILII.d.mts} +112 -14
- package/dist/types-LlyYILII.d.mts.map +1 -0
- package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
- package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
- package/dist/vector-B0panuy6.mjs +95 -0
- package/dist/vector-B0panuy6.mjs.map +1 -0
- package/docs/PROJECT-STATE.md +321 -0
- package/docs/adding-a-model-family.md +280 -0
- package/docs/ai-sdk.md +70 -61
- package/docs/architecture/overview.md +17 -7
- package/docs/browser.md +203 -8
- package/docs/embeddings.md +156 -0
- package/docs/gerbil-site-native-migration.md +217 -0
- package/docs/gpu-engine/architectures.md +398 -0
- package/docs/gpu-engine/ir.md +372 -0
- package/docs/gpu-engine/kernels.md +718 -0
- package/docs/gpu-engine/paper.html +1759 -0
- package/docs/gpu-engine/paper.md +2109 -0
- package/docs/gpu-engine/safetensors.md +312 -0
- package/docs/gpu-engine/tokenizer.md +302 -0
- package/docs/memory-rag.md +91 -0
- package/docs/metal-safari-intel.md +190 -0
- package/docs/mobile-failure-diagnosis.md +124 -0
- package/docs/mobile.md +99 -0
- package/docs/observability.md +230 -0
- package/docs/onnx-removal-plan.md +339 -0
- package/docs/research/autoresearch-portable.md +904 -0
- package/docs/research/dispatch-reduction-hivemind.md +84 -0
- package/docs/research/ios-safari-model-caching.md +117 -0
- package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
- package/docs/research/native-stt-model-selection.md +49 -0
- package/docs/research/native-tts-model-selection.md +90 -0
- package/docs/research/native-vs-chromium-decision.md +152 -0
- package/docs/research/nemotron-mamba2-inference.md +910 -0
- package/docs/research/qwen35-multimodal.md +293 -0
- package/docs/research/qwen36-gemma4-targets.md +337 -0
- package/docs/research/sota-embedding-models.md +179 -0
- package/docs/research/sota-mobile-models-2026.md +263 -0
- package/docs/research/sota-modality-models.md +202 -0
- package/docs/research/tps-baselines.md +71 -0
- package/docs/research/webgpu-m4-reference.md +104 -0
- package/docs/site-update-plan.md +155 -0
- package/docs/structured-output.md +123 -0
- package/docs/stt.md +63 -446
- package/docs/tts.md +77 -499
- package/docs/vision.md +100 -338
- package/package.json +22 -7
- package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
- package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
- package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
- package/dist/gerbil-CJ3ifloF.mjs +0 -4
- package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
- package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
- package/dist/gerbil-qOTe1nl2.d.mts +0 -431
- package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
- package/dist/kokoro-BNTb6egA.mjs +0 -20210
- package/dist/kokoro-BNTb6egA.mjs.map +0 -1
- package/dist/kokoro-CMOGDSgT.js +0 -20212
- package/dist/kokoro-CMOGDSgT.js.map +0 -1
- package/dist/mcp-BvbriaBy.mjs.map +0 -1
- package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
- package/dist/repl-DveXw36T.mjs +0 -9
- package/dist/skills-CD3Orlex.mjs.map +0 -1
- package/dist/stt-Bu-E23Sc.js +0 -433
- package/dist/stt-Bu-E23Sc.js.map +0 -1
- package/dist/stt-CpLYbGFd.mjs +0 -433
- package/dist/stt-CpLYbGFd.mjs.map +0 -1
- package/dist/stt-DRPLEEHB.mjs +0 -3
- package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
- package/dist/transformers.web-DiD1gTwk.js +0 -44695
- package/dist/transformers.web-DiD1gTwk.js.map +0 -1
- package/dist/transformers.web-u34VxRFM.js +0 -3
- package/dist/tts-CqroPaSK.js +0 -724
- package/dist/tts-CqroPaSK.js.map +0 -1
- package/dist/tts-DXgsKGCe.mjs +0 -3
- package/dist/tts-DeGANMNV.mjs +0 -730
- package/dist/tts-DeGANMNV.mjs.map +0 -1
- package/dist/types-CiTc7ez3.d.mts.map +0 -1
- /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
- /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
- /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
package/docs/tts.md
CHANGED
|
@@ -1,569 +1,147 @@
|
|
|
1
1
|
# Text-to-Speech
|
|
2
2
|
|
|
3
|
-
Gerbil
|
|
3
|
+
Gerbil's native WebGPU engine synthesizes speech with **Kani-TTS-2** — an LFM2-350M
|
|
4
|
+
codec-LM backbone driving NVIDIA's NeMo **NanoCodec** decoder (FSQ + causal HiFi-GAN).
|
|
5
|
+
`engine.speak()` returns **22.05 kHz mono PCM**, fully on-device, no ONNX.
|
|
4
6
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
```typescript
|
|
10
|
-
import { Gerbil } from "@tryhamster/gerbil";
|
|
11
|
-
|
|
12
|
-
const g = new Gerbil();
|
|
13
|
-
|
|
14
|
-
// Generate speech
|
|
15
|
-
const result = await g.speak("Hello, I'm Gerbil!", {
|
|
16
|
-
voice: "af_heart", // American female (highest quality)
|
|
17
|
-
speed: 1.0
|
|
18
|
-
});
|
|
7
|
+
> **Pre-1.0.** Kani-TTS-2 is the only TTS path. The old Kokoro / Supertonic ONNX /
|
|
8
|
+
> transformers.js lane has been removed. The `Gerbil`-class `speak()` method still works for
|
|
9
|
+
> backward compatibility but is now a thin wrapper over the native engine (see
|
|
10
|
+
> [below](#gerbil-class-speak-native-wrapper)).
|
|
19
11
|
|
|
20
|
-
|
|
21
|
-
// result.sampleRate = 24000
|
|
22
|
-
// result.duration = seconds
|
|
23
|
-
```
|
|
24
|
-
|
|
25
|
-
### React (Browser)
|
|
26
|
-
|
|
27
|
-
```tsx
|
|
28
|
-
import { useSpeech } from "@tryhamster/gerbil/browser";
|
|
29
|
-
|
|
30
|
-
function SpeechDemo() {
|
|
31
|
-
// Default: Kokoro TTS (24kHz, 28 voices)
|
|
32
|
-
const { speak, stop, isSpeaking, isLoading, listVoices, setVoice } = useSpeech();
|
|
33
|
-
|
|
34
|
-
// Or use Supertonic (44.1kHz, 4 voices, faster)
|
|
35
|
-
// const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
|
|
36
|
-
|
|
37
|
-
if (isLoading) return <div>Loading TTS model...</div>;
|
|
38
|
-
|
|
39
|
-
return (
|
|
40
|
-
<div>
|
|
41
|
-
<select onChange={e => setVoice(e.target.value)}>
|
|
42
|
-
{listVoices().map(v => (
|
|
43
|
-
<option key={v.id} value={v.id}>{v.name} ({v.language})</option>
|
|
44
|
-
))}
|
|
45
|
-
</select>
|
|
46
|
-
|
|
47
|
-
<button onClick={() => speak("Hello world!")}>
|
|
48
|
-
{isSpeaking ? "Speaking..." : "Speak"}
|
|
49
|
-
</button>
|
|
50
|
-
|
|
51
|
-
{isSpeaking && <button onClick={stop}>Stop</button>}
|
|
52
|
-
</div>
|
|
53
|
-
);
|
|
54
|
-
}
|
|
55
|
-
```
|
|
12
|
+
## Quick Start
|
|
56
13
|
|
|
57
|
-
###
|
|
14
|
+
### Node
|
|
58
15
|
|
|
59
16
|
```typescript
|
|
60
|
-
import {
|
|
61
|
-
import { gerbil } from "@tryhamster/gerbil/ai";
|
|
62
|
-
|
|
63
|
-
const audio = await generateSpeech({
|
|
64
|
-
model: gerbil.speech(),
|
|
65
|
-
text: "Hello from Gerbil!",
|
|
66
|
-
voice: "bf_emma", // British female
|
|
67
|
-
});
|
|
68
|
-
|
|
69
|
-
// audio.audioData = Uint8Array (WAV format)
|
|
70
|
-
```
|
|
17
|
+
import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
|
|
71
18
|
|
|
72
|
-
|
|
19
|
+
const engine = await WebGPUEngine.create({ repo: "nineninesix/kani-tts-2-en" });
|
|
73
20
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
21
|
+
const { pcm, sampleRate, audioSeconds } = await engine.speak("Hello, I'm Gerbil!");
|
|
22
|
+
// pcm: Float32Array in [-1, 1], sampleRate === 22050
|
|
23
|
+
console.log(`${audioSeconds.toFixed(2)}s of audio`);
|
|
77
24
|
|
|
78
|
-
|
|
79
|
-
gerbil speak --voice bf_emma --speed 1.2 "Cheerio!"
|
|
25
|
+
engine.destroy();
|
|
80
26
|
```
|
|
81
27
|
|
|
82
|
-
###
|
|
28
|
+
### Options
|
|
83
29
|
|
|
84
30
|
```typescript
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
//
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
//
|
|
91
|
-
await announce({
|
|
92
|
-
message: "Build completed successfully",
|
|
93
|
-
style: "excited", // casual, formal, excited, calm, urgent
|
|
94
|
-
voice: "af_bella"
|
|
95
|
-
});
|
|
96
|
-
|
|
97
|
-
// Read file aloud
|
|
98
|
-
await readAloud({
|
|
99
|
-
content: "./README.md",
|
|
100
|
-
voice: "bf_emma",
|
|
101
|
-
summarizeIfLong: true // Summarize if > 5000 chars
|
|
31
|
+
const result = await engine.speak("Speak with feeling.", {
|
|
32
|
+
languageTag: "en_us", // prepended as "{tag}: {text}" (default "en_us")
|
|
33
|
+
temperature: 1.0, // sampling temperature (default 1.0)
|
|
34
|
+
topP: 0.95, // nucleus threshold (default 0.95)
|
|
35
|
+
repetitionPenalty: 1.1, // (default 1.1)
|
|
36
|
+
maxFrames: 2000, // cap audio length (default: no explicit cap; limited only by maxSeqLen)
|
|
102
37
|
});
|
|
103
38
|
```
|
|
104
39
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
Gerbil supports multiple TTS backends:
|
|
108
|
-
|
|
109
|
-
| Model | Size | Sample Rate | Voices | Speed | Notes |
|
|
110
|
-
|-------|------|-------------|--------|-------|-------|
|
|
111
|
-
| `kokoro-82m` | ~330MB | 24000 Hz | 28 | ~100x RT | Default, highest quality |
|
|
112
|
-
| `supertonic-66m` | ~250MB | 44100 Hz | 4 | ~167x RT | Faster, HiFi output |
|
|
113
|
-
|
|
114
|
-
### Selecting a Model
|
|
115
|
-
|
|
116
|
-
```typescript
|
|
117
|
-
// Use default (Kokoro)
|
|
118
|
-
await g.loadTTS();
|
|
119
|
-
|
|
120
|
-
// Use Supertonic (faster, higher sample rate)
|
|
121
|
-
await g.loadTTS({ model: "supertonic-66m" });
|
|
122
|
-
|
|
123
|
-
// List available models
|
|
124
|
-
const models = await g.listTTSModels();
|
|
125
|
-
// [{ id: "kokoro-82m", sampleRate: 24000, voiceCount: 28 }, ...]
|
|
126
|
-
```
|
|
127
|
-
|
|
128
|
-
## Available Voices
|
|
129
|
-
|
|
130
|
-
### Kokoro Voices
|
|
131
|
-
|
|
132
|
-
Kokoro provides 28 voices across American and British English:
|
|
133
|
-
|
|
134
|
-
### American English - Female (Recommended)
|
|
40
|
+
`speak()` requires a Kani-TTS-2 checkpoint (architecture `KaniTTS2ForCausalLM`). The
|
|
41
|
+
NanoCodec decoder checkpoint is downloaded lazily on the first `speak()` call.
|
|
135
42
|
|
|
136
|
-
|
|
137
|
-
|----------|------|---------|-------------|
|
|
138
|
-
| `af_heart` | Heart | A | Highest quality, warm tone |
|
|
139
|
-
| `af_bella` | Bella | A- | Warm and friendly |
|
|
140
|
-
| `af_nicole` | Nicole | B- | Soft and gentle |
|
|
141
|
-
| `af_sarah` | Sarah | C+ | Clear and professional |
|
|
142
|
-
|
|
143
|
-
### American English - Male
|
|
144
|
-
|
|
145
|
-
| Voice ID | Name | Quality | Description |
|
|
146
|
-
|----------|------|---------|-------------|
|
|
147
|
-
| `am_fenrir` | Fenrir | C+ | Best male quality |
|
|
148
|
-
| `am_michael` | Michael | C+ | Warm and friendly |
|
|
149
|
-
| `am_puck` | Puck | C+ | Neutral tone |
|
|
150
|
-
|
|
151
|
-
### British English - Female
|
|
152
|
-
|
|
153
|
-
| Voice ID | Name | Quality | Description |
|
|
154
|
-
|----------|------|---------|-------------|
|
|
155
|
-
| `bf_emma` | Emma | B- | Elegant and clear |
|
|
156
|
-
| `bf_isabella` | Isabella | C | Sophisticated |
|
|
157
|
-
|
|
158
|
-
### British English - Male
|
|
159
|
-
|
|
160
|
-
| Voice ID | Name | Quality | Description |
|
|
161
|
-
|----------|------|---------|-------------|
|
|
162
|
-
| `bm_george` | George | C | Distinguished |
|
|
163
|
-
| `bm_fable` | Fable | C | Storyteller tone |
|
|
164
|
-
|
|
165
|
-
> **Tip:** `af_heart` is the highest quality voice. Start there and experiment with others.
|
|
166
|
-
|
|
167
|
-
### Supertonic Voices
|
|
168
|
-
|
|
169
|
-
Supertonic provides 4 high-quality voices:
|
|
170
|
-
|
|
171
|
-
| Voice | Gender | Description |
|
|
172
|
-
|-------|--------|-------------|
|
|
173
|
-
| `F1` | Female | Clear and natural |
|
|
174
|
-
| `F2` | Female | Warm and expressive |
|
|
175
|
-
| `M1` | Male | Deep and confident |
|
|
176
|
-
| `M2` | Male | Friendly and casual |
|
|
177
|
-
|
|
178
|
-
## API Reference
|
|
179
|
-
|
|
180
|
-
### Gerbil Class Methods
|
|
43
|
+
## Result
|
|
181
44
|
|
|
182
45
|
```typescript
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
// Stream speech (yields chunks as generated)
|
|
194
|
-
async *speakStream(text: string, options?: SpeakOptions): AsyncGenerator<AudioChunk>;
|
|
195
|
-
|
|
196
|
-
// Get available voices
|
|
197
|
-
listVoices(): VoiceInfo[];
|
|
198
|
-
getVoice(voiceId: string): VoiceInfo | null;
|
|
46
|
+
interface SpeakResult {
|
|
47
|
+
/** Mono PCM in [-1, 1]. */
|
|
48
|
+
pcm: Float32Array;
|
|
49
|
+
/** Sample rate — always 22050. */
|
|
50
|
+
sampleRate: number;
|
|
51
|
+
/** Number of audio frames decoded. */
|
|
52
|
+
frames: number;
|
|
53
|
+
/** Audio duration in seconds. */
|
|
54
|
+
audioSeconds: number;
|
|
199
55
|
}
|
|
200
56
|
```
|
|
201
57
|
|
|
202
|
-
|
|
58
|
+
## Standalone KaniTTS
|
|
59
|
+
|
|
60
|
+
`engine.speak()` lazily builds a `KaniTTS` engine under the hood. You can also use it
|
|
61
|
+
directly (e.g. without a separate text model):
|
|
203
62
|
|
|
204
63
|
```typescript
|
|
205
|
-
|
|
206
|
-
/** Voice ID (default: "af_heart") */
|
|
207
|
-
voice?: string;
|
|
208
|
-
|
|
209
|
-
/** Speed multiplier 0.5-2.0 (default: 1.0) */
|
|
210
|
-
speed?: number;
|
|
211
|
-
|
|
212
|
-
/** Progress callback during loading */
|
|
213
|
-
onProgress?: (info: ProgressInfo) => void;
|
|
214
|
-
|
|
215
|
-
/** Callback for each audio chunk (streaming) */
|
|
216
|
-
onAudioChunk?: (chunk: AudioChunk) => void;
|
|
217
|
-
}
|
|
218
|
-
```
|
|
64
|
+
import { KaniTTS } from "@tryhamster/gerbil/gpu";
|
|
219
65
|
|
|
220
|
-
|
|
66
|
+
const tts = await KaniTTS.create({
|
|
67
|
+
repo: "nineninesix/kani-tts-2-en",
|
|
68
|
+
// codecRepo defaults to the NeMo 22 kHz NanoCodec MLX checkpoint
|
|
69
|
+
});
|
|
221
70
|
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
/** PCM audio samples (mono, float32, -1 to 1) */
|
|
225
|
-
audio: Float32Array;
|
|
226
|
-
|
|
227
|
-
/** Sample rate (always 24000 for Kokoro) */
|
|
228
|
-
sampleRate: number;
|
|
229
|
-
|
|
230
|
-
/** Duration in seconds */
|
|
231
|
-
duration: number;
|
|
232
|
-
|
|
233
|
-
/** Voice ID used */
|
|
234
|
-
voice: string;
|
|
235
|
-
|
|
236
|
-
/** Generation time in milliseconds */
|
|
237
|
-
totalTime: number;
|
|
238
|
-
}
|
|
71
|
+
const { pcm, sampleRate } = await tts.speak("Hello world!");
|
|
72
|
+
tts.destroy();
|
|
239
73
|
```
|
|
240
74
|
|
|
241
|
-
## Playing Audio
|
|
75
|
+
## Playing / Saving Audio
|
|
242
76
|
|
|
243
77
|
### Browser (Web Audio API)
|
|
244
78
|
|
|
245
79
|
```typescript
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
// Stop early
|
|
254
|
-
controller.stop();
|
|
255
|
-
|
|
256
|
-
// Wait for completion
|
|
257
|
-
await controller.onEnded;
|
|
80
|
+
const ctx = new AudioContext({ sampleRate });
|
|
81
|
+
const buffer = ctx.createBuffer(1, pcm.length, sampleRate);
|
|
82
|
+
buffer.copyToChannel(pcm, 0);
|
|
83
|
+
const source = ctx.createBufferSource();
|
|
84
|
+
source.buffer = buffer;
|
|
85
|
+
source.connect(ctx.destination);
|
|
86
|
+
source.start();
|
|
258
87
|
```
|
|
259
88
|
|
|
260
|
-
### Node
|
|
89
|
+
### Node (save to WAV)
|
|
261
90
|
|
|
262
91
|
```typescript
|
|
263
|
-
import { writeFileSync } from "fs";
|
|
264
|
-
import { execSync } from "child_process";
|
|
92
|
+
import { writeFileSync } from "node:fs";
|
|
265
93
|
|
|
266
|
-
const result = await gerbil.speak("Hello!");
|
|
267
|
-
|
|
268
|
-
// Convert to WAV
|
|
269
94
|
function saveWav(filename: string, audio: Float32Array, sampleRate: number) {
|
|
270
95
|
const buffer = Buffer.alloc(44 + audio.length * 2);
|
|
271
|
-
|
|
272
|
-
// WAV header
|
|
273
96
|
buffer.write("RIFF", 0);
|
|
274
97
|
buffer.writeUInt32LE(36 + audio.length * 2, 4);
|
|
275
98
|
buffer.write("WAVE", 8);
|
|
276
99
|
buffer.write("fmt ", 12);
|
|
277
100
|
buffer.writeUInt32LE(16, 16);
|
|
278
|
-
buffer.writeUInt16LE(1, 20);
|
|
279
|
-
buffer.writeUInt16LE(1, 22);
|
|
101
|
+
buffer.writeUInt16LE(1, 20); // PCM
|
|
102
|
+
buffer.writeUInt16LE(1, 22); // mono
|
|
280
103
|
buffer.writeUInt32LE(sampleRate, 24);
|
|
281
104
|
buffer.writeUInt32LE(sampleRate * 2, 28);
|
|
282
105
|
buffer.writeUInt16LE(2, 32);
|
|
283
106
|
buffer.writeUInt16LE(16, 34);
|
|
284
107
|
buffer.write("data", 36);
|
|
285
108
|
buffer.writeUInt32LE(audio.length * 2, 40);
|
|
286
|
-
|
|
287
|
-
// Audio data
|
|
288
109
|
for (let i = 0; i < audio.length; i++) {
|
|
289
110
|
const s = Math.max(-1, Math.min(1, audio[i]));
|
|
290
111
|
buffer.writeInt16LE(Math.round(s * 32767), 44 + i * 2);
|
|
291
112
|
}
|
|
292
|
-
|
|
293
113
|
writeFileSync(filename, buffer);
|
|
294
114
|
}
|
|
295
115
|
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
//
|
|
299
|
-
execSync("afplay output.wav");
|
|
300
|
-
|
|
301
|
-
// Play on Linux
|
|
302
|
-
// execSync("aplay output.wav");
|
|
116
|
+
const { pcm, sampleRate } = await engine.speak("Hello!");
|
|
117
|
+
saveWav("output.wav", pcm, sampleRate); // 22.05 kHz mono
|
|
118
|
+
// macOS playback (import { execSync } from "node:child_process" first): execSync("afplay output.wav");
|
|
303
119
|
```
|
|
304
120
|
|
|
305
|
-
##
|
|
306
|
-
|
|
307
|
-
For long text, stream audio chunks as they're generated:
|
|
308
|
-
|
|
309
|
-
```typescript
|
|
310
|
-
// Node.js streaming
|
|
311
|
-
for await (const chunk of gerbil.speakStream("Long paragraph of text...")) {
|
|
312
|
-
console.log(`Chunk ${chunk.index}: ${chunk.samples.length} samples`);
|
|
313
|
-
// Process/play chunk.samples
|
|
314
|
-
|
|
315
|
-
if (chunk.isFinal) {
|
|
316
|
-
console.log("Done!");
|
|
317
|
-
}
|
|
318
|
-
}
|
|
121
|
+
## Model
|
|
319
122
|
|
|
320
|
-
|
|
321
|
-
import { createAudioPlayer } from "@tryhamster/gerbil/browser";
|
|
123
|
+
### Kani-TTS-2
|
|
322
124
|
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
```
|
|
329
|
-
|
|
330
|
-
## useSpeech Hook Reference
|
|
331
|
-
|
|
332
|
-
```typescript
|
|
333
|
-
const {
|
|
334
|
-
// Actions
|
|
335
|
-
speak, // (text: string, opts?) => Promise<void>
|
|
336
|
-
stop, // () => void
|
|
337
|
-
load, // () => void (manual load trigger)
|
|
338
|
-
|
|
339
|
-
// State
|
|
340
|
-
isLoading, // boolean - model loading
|
|
341
|
-
isSpeaking, // boolean - currently speaking
|
|
342
|
-
isReady, // boolean - model ready
|
|
343
|
-
error, // string | null
|
|
344
|
-
|
|
345
|
-
// Voice control
|
|
346
|
-
listVoices, // () => VoiceInfo[] - voices for current model
|
|
347
|
-
currentVoice, // string
|
|
348
|
-
setVoice, // (id: string) => void
|
|
349
|
-
currentSpeed, // number
|
|
350
|
-
setSpeed, // (speed: number) => void
|
|
351
|
-
|
|
352
|
-
// Model info
|
|
353
|
-
currentModel, // TTSModelId - "kokoro-82m" | "supertonic-66m"
|
|
354
|
-
sampleRate, // number - 24000 (Kokoro) or 44100 (Supertonic)
|
|
355
|
-
|
|
356
|
-
// Loading progress
|
|
357
|
-
loadingProgress, // { status, file?, progress? }
|
|
358
|
-
} = useSpeech({
|
|
359
|
-
model: "kokoro-82m", // TTS model: "kokoro-82m" (default) or "supertonic-66m"
|
|
360
|
-
voice: "af_heart", // default voice (model-specific)
|
|
361
|
-
speed: 1.0, // default speed
|
|
362
|
-
autoLoad: false, // load on first speak() call
|
|
363
|
-
onReady: () => {},
|
|
364
|
-
onError: (err) => {},
|
|
365
|
-
onStart: () => {},
|
|
366
|
-
onEnd: () => {},
|
|
367
|
-
});
|
|
368
|
-
```
|
|
369
|
-
|
|
370
|
-
### Model-Specific Voices
|
|
371
|
-
|
|
372
|
-
Each model has its own set of voices:
|
|
373
|
-
|
|
374
|
-
```typescript
|
|
375
|
-
// Kokoro (default) - 28 voices
|
|
376
|
-
const { listVoices } = useSpeech(); // or model: "kokoro-82m"
|
|
377
|
-
// → af_heart, af_bella, am_fenrir, bf_emma, bm_george, ...
|
|
378
|
-
|
|
379
|
-
// Supertonic - 4 voices
|
|
380
|
-
const { listVoices } = useSpeech({ model: "supertonic-66m" });
|
|
381
|
-
// → F1, F2, M1, M2
|
|
382
|
-
```
|
|
383
|
-
|
|
384
|
-
**Note:** Voice IDs are model-specific. Using a Kokoro voice with Supertonic (or vice versa) will throw an error.
|
|
385
|
-
|
|
386
|
-
## Performance
|
|
387
|
-
|
|
388
|
-
| Metric | Value |
|
|
389
|
-
|--------|-------|
|
|
390
|
-
| Model size | ~330MB |
|
|
391
|
-
| First load | 3-10s (downloads model) |
|
|
392
|
-
| Cached load | <1s |
|
|
393
|
-
| Generation speed | ~8x realtime on M1 Mac |
|
|
394
|
-
| Sample rate | 24kHz |
|
|
395
|
-
| Audio format | Mono Float32 PCM |
|
|
396
|
-
|
|
397
|
-
## Troubleshooting
|
|
398
|
-
|
|
399
|
-
### Audio sounds like gibberish
|
|
400
|
-
|
|
401
|
-
This happens when using raw transformers.js without proper phoneme conversion. Gerbil uses `kokoro-js` which handles grapheme-to-phoneme (G2P) conversion automatically.
|
|
402
|
-
|
|
403
|
-
### Audio is too quiet
|
|
404
|
-
|
|
405
|
-
Kokoro output can be quiet. The audio is normalized by default, but if you're processing raw audio, ensure proper normalization:
|
|
406
|
-
|
|
407
|
-
```typescript
|
|
408
|
-
function normalizeAudio(audio: Float32Array, targetRms = 0.15): Float32Array {
|
|
409
|
-
const currentRms = Math.sqrt(audio.reduce((a, b) => a + b * b, 0) / audio.length);
|
|
410
|
-
const gain = targetRms / currentRms;
|
|
411
|
-
return audio.map(s => Math.max(-1, Math.min(1, s * gain)));
|
|
412
|
-
}
|
|
413
|
-
```
|
|
414
|
-
|
|
415
|
-
### Browser autoplay blocked
|
|
416
|
-
|
|
417
|
-
Browsers require user interaction before playing audio. Trigger speech from a click handler:
|
|
418
|
-
|
|
419
|
-
```tsx
|
|
420
|
-
<button onClick={() => speak("Hello!")}>Speak</button>
|
|
421
|
-
```
|
|
422
|
-
|
|
423
|
-
### Voice not found
|
|
424
|
-
|
|
425
|
-
Use `listVoices()` to see available voice IDs. Voice IDs follow the pattern `{language}{gender}_{name}`:
|
|
426
|
-
- `af_` = American female
|
|
427
|
-
- `am_` = American male
|
|
428
|
-
- `bf_` = British female
|
|
429
|
-
- `bm_` = British male
|
|
430
|
-
|
|
431
|
-
## Model Info
|
|
432
|
-
|
|
433
|
-
### Kokoro-82M
|
|
434
|
-
|
|
435
|
-
Uses [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) via the [ONNX export](https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX) and [kokoro-js](https://www.npmjs.com/package/kokoro-js) wrapper.
|
|
436
|
-
|
|
437
|
-
- **Architecture:** StyleTTS2-based
|
|
438
|
-
- **Parameters:** 82M
|
|
439
|
-
- **Languages:** English (US + UK)
|
|
440
|
-
- **License:** Apache 2.0
|
|
441
|
-
|
|
442
|
-
### Supertonic-66M
|
|
443
|
-
|
|
444
|
-
Uses [Supertonic-TTS](https://huggingface.co/onnx-community/Supertonic-TTS-ONNX) via transformers.js pipeline.
|
|
445
|
-
|
|
446
|
-
- **Parameters:** 66M
|
|
447
|
-
- **Languages:** English
|
|
448
|
-
- **Sample Rate:** 44.1kHz (HiFi)
|
|
449
|
-
- **Speed:** ~167x realtime
|
|
450
|
-
- **License:** Apache 2.0
|
|
125
|
+
- **Backbone:** LFM2-350M codec-LM (architecture `KaniTTS2ForCausalLM`)
|
|
126
|
+
- **Codec:** NVIDIA NeMo NanoCodec (FSQ + causal HiFi-GAN), validated bit-exact
|
|
127
|
+
- **Sample rate:** 22.05 kHz
|
|
128
|
+
- **Repos:** `nineninesix/kani-tts-2-en`. License varies by variant (the `kani-tts-2-en`
|
|
129
|
+
checkpoint is LFM1.0/other; a 450M variant is Apache 2.0).
|
|
451
130
|
|
|
452
131
|
---
|
|
453
132
|
|
|
454
|
-
##
|
|
133
|
+
## `Gerbil`-class `speak()` (native wrapper)
|
|
455
134
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
135
|
+
> The Kokoro-82M / Supertonic-66M ONNX/transformers.js TTS lane has been removed. The
|
|
136
|
+
> `Gerbil`-class `speak()` method below now runs the native Kani-TTS-2 engine under the hood
|
|
137
|
+
> (it requires WebGPU and returns 22.05 kHz PCM). The browser `useSpeech` hook is gone — use
|
|
138
|
+
> `useTTS` from `@tryhamster/gerbil/gpu/hooks`. The AI SDK `gerbil.speech()` provider also
|
|
139
|
+
> routes through this native path.
|
|
461
140
|
|
|
462
141
|
```typescript
|
|
463
|
-
|
|
464
|
-
export const MY_MODEL_VOICES: VoiceInfo[] = [
|
|
465
|
-
{ id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
|
|
466
|
-
// ...
|
|
467
|
-
];
|
|
468
|
-
|
|
469
|
-
// 2. Add to TTS_MODELS registry
|
|
470
|
-
export const TTS_MODELS: Record<string, TTSModelConfig> = {
|
|
471
|
-
// ...existing models...
|
|
472
|
-
"my-model-50m": {
|
|
473
|
-
id: "my-model-50m",
|
|
474
|
-
repo: "hf-org/my-model-onnx",
|
|
475
|
-
description: "My Model 50M - Description",
|
|
476
|
-
size: "~150MB",
|
|
477
|
-
sampleRate: 22050,
|
|
478
|
-
voices: MY_MODEL_VOICES,
|
|
479
|
-
defaultVoice: "voice1",
|
|
480
|
-
languages: ["en"],
|
|
481
|
-
},
|
|
482
|
-
};
|
|
483
|
-
|
|
484
|
-
// 3. Create TTS class (see KokoroTTS or SupertonicTTS as examples)
|
|
485
|
-
export class MyModelTTS {
|
|
486
|
-
async load(options: LoadTTSOptions): Promise<void> { /* ... */ }
|
|
487
|
-
async speak(text: string, options: SpeakOptions): Promise<SpeakResult> { /* ... */ }
|
|
488
|
-
listVoices(): VoiceInfo[] { /* ... */ }
|
|
489
|
-
// ...
|
|
490
|
-
}
|
|
491
|
-
|
|
492
|
-
// 4. Update createTTS factory
|
|
493
|
-
export function createTTS(modelId: string = "kokoro-82m"): TTSBackend {
|
|
494
|
-
if (modelId.startsWith("my-model")) {
|
|
495
|
-
return new MyModelTTS(modelId);
|
|
496
|
-
}
|
|
497
|
-
// ...existing logic...
|
|
498
|
-
}
|
|
499
|
-
```
|
|
500
|
-
|
|
501
|
-
### 2. Browser Hook (`src/browser/index.ts`)
|
|
502
|
-
|
|
503
|
-
Add browser support in the useSpeech hook:
|
|
504
|
-
|
|
505
|
-
```typescript
|
|
506
|
-
// 1. Add voice definitions for browser
|
|
507
|
-
const MY_MODEL_BROWSER_VOICES: BrowserVoiceInfo[] = [
|
|
508
|
-
{ id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
|
|
509
|
-
];
|
|
510
|
-
|
|
511
|
-
// 2. Update TTS_MODELS in browser
|
|
512
|
-
const TTS_MODELS: Record<TTSModelId, {...}> = {
|
|
513
|
-
// ...existing models...
|
|
514
|
-
"my-model-50m": {
|
|
515
|
-
repo: "hf-org/my-model-onnx",
|
|
516
|
-
defaultVoice: "voice1",
|
|
517
|
-
sampleRate: 22050,
|
|
518
|
-
voices: MY_MODEL_BROWSER_VOICES,
|
|
519
|
-
},
|
|
520
|
-
};
|
|
521
|
-
|
|
522
|
-
// 3. Update TTSModelId type
|
|
523
|
-
export type TTSModelId = "kokoro-82m" | "supertonic-66m" | "my-model-50m";
|
|
524
|
-
|
|
525
|
-
// 4. Add loading logic in useSpeech useEffect
|
|
526
|
-
if (modelId === "my-model-50m") {
|
|
527
|
-
// Load using appropriate method (pipeline, custom loader, etc.)
|
|
528
|
-
}
|
|
529
|
-
```
|
|
530
|
-
|
|
531
|
-
### 3. Gerbil Class (`src/core/gerbil.ts`)
|
|
532
|
-
|
|
533
|
-
Update the main Gerbil class to support the new model:
|
|
534
|
-
|
|
535
|
-
```typescript
|
|
536
|
-
// Add type import
|
|
537
|
-
type MyModelTTSType = import("./tts.js").MyModelTTS;
|
|
142
|
+
import { Gerbil } from "@tryhamster/gerbil";
|
|
538
143
|
|
|
539
|
-
|
|
540
|
-
|
|
144
|
+
const g = new Gerbil();
|
|
145
|
+
const result = await g.speak("Hello!");
|
|
146
|
+
// result.audio = Float32Array, result.sampleRate = 22050 (native Kani-TTS-2)
|
|
541
147
|
```
|
|
542
|
-
|
|
543
|
-
### 4. Integration Points
|
|
544
|
-
|
|
545
|
-
Update these integration files if applicable:
|
|
546
|
-
|
|
547
|
-
- **AI SDK** (`src/integrations/ai-sdk.ts`): Add speech model support
|
|
548
|
-
- **Skills** (`src/skills/builtin/speak.ts`, etc.): Update voice enums
|
|
549
|
-
- **CLI** (`src/cli/index.ts`): Add model to CLI options
|
|
550
|
-
|
|
551
|
-
### 5. Documentation
|
|
552
|
-
|
|
553
|
-
Update documentation:
|
|
554
|
-
|
|
555
|
-
- `docs/tts.md`: Add model to tables and examples
|
|
556
|
-
- `.cursor/rules/gerbil/tts.mdc`: Update implementation patterns
|
|
557
|
-
|
|
558
|
-
### File Summary
|
|
559
|
-
|
|
560
|
-
| File | Purpose |
|
|
561
|
-
|------|---------|
|
|
562
|
-
| `src/core/tts.ts` | Core TTS classes, voice registry, model config |
|
|
563
|
-
| `src/browser/index.ts` | React hooks (useSpeech, useVoiceChat) |
|
|
564
|
-
| `src/core/gerbil.ts` | Main Gerbil class integration |
|
|
565
|
-
| `src/integrations/ai-sdk.ts` | AI SDK v5 speech model |
|
|
566
|
-
| `src/skills/builtin/speak.ts` | Speak skill voice options |
|
|
567
|
-
| `docs/tts.md` | User documentation |
|
|
568
|
-
| `.cursor/rules/gerbil/tts.mdc` | Developer patterns |
|
|
569
|
-
|