@tryhamster/gerbil 1.0.0-rc.8 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (179) hide show
  1. package/LICENSE +1 -1
  2. package/README.md +247 -84
  3. package/dist/architectures-C1I5V3Dt.mjs +6070 -0
  4. package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
  5. package/dist/browser/index.d.ts +264 -588
  6. package/dist/browser/index.d.ts.map +1 -1
  7. package/dist/browser/index.js +585 -2334
  8. package/dist/browser/index.js.map +1 -1
  9. package/dist/cli.mjs +625 -1098
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/defaults-9komdrbY.mjs +24 -0
  12. package/dist/defaults-9komdrbY.mjs.map +1 -0
  13. package/dist/frameworks/express.d.mts +1 -3
  14. package/dist/frameworks/express.d.mts.map +1 -1
  15. package/dist/frameworks/express.mjs +7 -7
  16. package/dist/frameworks/express.mjs.map +1 -1
  17. package/dist/frameworks/fastify.d.mts +1 -1
  18. package/dist/frameworks/fastify.d.mts.map +1 -1
  19. package/dist/frameworks/fastify.mjs +3 -3
  20. package/dist/frameworks/fastify.mjs.map +1 -1
  21. package/dist/frameworks/hono.d.mts +1 -1
  22. package/dist/frameworks/hono.d.mts.map +1 -1
  23. package/dist/frameworks/hono.mjs +4 -4
  24. package/dist/frameworks/hono.mjs.map +1 -1
  25. package/dist/frameworks/next.d.mts +3 -2
  26. package/dist/frameworks/next.d.mts.map +1 -1
  27. package/dist/frameworks/next.mjs +4 -4
  28. package/dist/frameworks/next.mjs.map +1 -1
  29. package/dist/frameworks/react.d.mts +1 -1
  30. package/dist/frameworks/trpc.d.mts +1 -1
  31. package/dist/frameworks/trpc.d.mts.map +1 -1
  32. package/dist/frameworks/trpc.mjs +4 -4
  33. package/dist/frameworks/trpc.mjs.map +1 -1
  34. package/dist/gerbil-BHrJJIa4.mjs +1656 -0
  35. package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
  36. package/dist/gerbil-BT9fCydo.d.mts +488 -0
  37. package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
  38. package/dist/gerbil-DomNfIr1.mjs +4 -0
  39. package/dist/gpu/hooks.d.mts +520 -0
  40. package/dist/gpu/hooks.d.mts.map +1 -0
  41. package/dist/gpu/hooks.mjs +1188 -0
  42. package/dist/gpu/hooks.mjs.map +1 -0
  43. package/dist/gpu/index.d.mts +2 -0
  44. package/dist/gpu/index.mjs +6 -0
  45. package/dist/gpu-33qCAtHW.mjs +3615 -0
  46. package/dist/gpu-33qCAtHW.mjs.map +1 -0
  47. package/dist/index-Dgmb2kE3.d.mts +245 -0
  48. package/dist/index-Dgmb2kE3.d.mts.map +1 -0
  49. package/dist/index-jEAL2s-A.d.mts +2022 -0
  50. package/dist/index-jEAL2s-A.d.mts.map +1 -0
  51. package/dist/index.d.mts +22 -487
  52. package/dist/index.d.mts.map +1 -1
  53. package/dist/index.mjs +13 -8
  54. package/dist/index.mjs.map +1 -1
  55. package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
  56. package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
  57. package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
  58. package/dist/integrations/ai-sdk.d.mts +75 -6
  59. package/dist/integrations/ai-sdk.d.mts.map +1 -1
  60. package/dist/integrations/ai-sdk.mjs +131 -15
  61. package/dist/integrations/ai-sdk.mjs.map +1 -1
  62. package/dist/integrations/langchain.d.mts +1 -1
  63. package/dist/integrations/langchain.d.mts.map +1 -1
  64. package/dist/integrations/langchain.mjs +5 -5
  65. package/dist/integrations/langchain.mjs.map +1 -1
  66. package/dist/integrations/llamaindex.d.mts +1 -1
  67. package/dist/integrations/llamaindex.d.mts.map +1 -1
  68. package/dist/integrations/llamaindex.mjs +5 -5
  69. package/dist/integrations/llamaindex.mjs.map +1 -1
  70. package/dist/integrations/mcp-client.mjs +3 -3
  71. package/dist/integrations/mcp-client.mjs.map +1 -1
  72. package/dist/integrations/mcp.d.mts +3 -2
  73. package/dist/integrations/mcp.d.mts.map +1 -1
  74. package/dist/integrations/mcp.mjs +5 -5
  75. package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
  76. package/dist/mcp-1DaMsaBc.mjs.map +1 -0
  77. package/dist/memory/index.d.mts +3 -0
  78. package/dist/memory/index.mjs +6 -0
  79. package/dist/memory-D1P7Tmda.mjs +4 -0
  80. package/dist/memory-DVN0MnIG.mjs +132 -0
  81. package/dist/memory-DVN0MnIG.mjs.map +1 -0
  82. package/dist/memory-Dj0J1v88.mjs +294 -0
  83. package/dist/memory-Dj0J1v88.mjs.map +1 -0
  84. package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
  85. package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
  86. package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
  87. package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
  88. package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
  89. package/dist/repl-jV5gcJFA.mjs +9 -0
  90. package/dist/skills/index.d.mts +270 -320
  91. package/dist/skills/index.d.mts.map +1 -1
  92. package/dist/skills/index.mjs +5 -5
  93. package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
  94. package/dist/skills-DX8D59UH.mjs.map +1 -0
  95. package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
  96. package/dist/tools-DQ1mPUw5.mjs.map +1 -0
  97. package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
  98. package/dist/types-D6FiR_oh.d.mts.map +1 -0
  99. package/dist/types-DQBe2lFo.d.mts +165 -0
  100. package/dist/types-DQBe2lFo.d.mts.map +1 -0
  101. package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
  102. package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
  103. package/dist/vector-B0panuy6.mjs +95 -0
  104. package/dist/vector-B0panuy6.mjs.map +1 -0
  105. package/docs/PROJECT-STATE.md +321 -0
  106. package/docs/adding-a-model-family.md +280 -0
  107. package/docs/ai-sdk.md +70 -61
  108. package/docs/architecture/overview.md +17 -7
  109. package/docs/browser.md +203 -8
  110. package/docs/embeddings.md +156 -0
  111. package/docs/gerbil-site-native-migration.md +217 -0
  112. package/docs/gpu-engine/architectures.md +398 -0
  113. package/docs/gpu-engine/ir.md +372 -0
  114. package/docs/gpu-engine/kernels.md +718 -0
  115. package/docs/gpu-engine/paper.html +1759 -0
  116. package/docs/gpu-engine/paper.md +2109 -0
  117. package/docs/gpu-engine/safetensors.md +312 -0
  118. package/docs/gpu-engine/tokenizer.md +302 -0
  119. package/docs/memory-rag.md +91 -0
  120. package/docs/metal-safari-intel.md +190 -0
  121. package/docs/mobile-failure-diagnosis.md +124 -0
  122. package/docs/mobile.md +99 -0
  123. package/docs/observability.md +230 -0
  124. package/docs/onnx-removal-plan.md +339 -0
  125. package/docs/research/autoresearch-portable.md +904 -0
  126. package/docs/research/dispatch-reduction-hivemind.md +84 -0
  127. package/docs/research/ios-safari-model-caching.md +117 -0
  128. package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
  129. package/docs/research/native-stt-model-selection.md +49 -0
  130. package/docs/research/native-tts-model-selection.md +90 -0
  131. package/docs/research/native-vs-chromium-decision.md +152 -0
  132. package/docs/research/nemotron-mamba2-inference.md +910 -0
  133. package/docs/research/qwen35-multimodal.md +293 -0
  134. package/docs/research/qwen36-gemma4-targets.md +337 -0
  135. package/docs/research/sota-embedding-models.md +179 -0
  136. package/docs/research/sota-mobile-models-2026.md +263 -0
  137. package/docs/research/sota-modality-models.md +202 -0
  138. package/docs/research/tps-baselines.md +71 -0
  139. package/docs/research/webgpu-m4-reference.md +104 -0
  140. package/docs/site-update-plan.md +155 -0
  141. package/docs/structured-output.md +123 -0
  142. package/docs/stt.md +63 -446
  143. package/docs/tts.md +77 -499
  144. package/docs/vision.md +100 -338
  145. package/package.json +22 -7
  146. package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
  147. package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
  148. package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
  149. package/dist/gerbil-CJ3ifloF.mjs +0 -4
  150. package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
  151. package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
  152. package/dist/gerbil-qOTe1nl2.d.mts +0 -431
  153. package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
  154. package/dist/kokoro-BNTb6egA.mjs +0 -20210
  155. package/dist/kokoro-BNTb6egA.mjs.map +0 -1
  156. package/dist/kokoro-DFRQ1OeM.js +0 -20212
  157. package/dist/kokoro-DFRQ1OeM.js.map +0 -1
  158. package/dist/mcp-BvbriaBy.mjs.map +0 -1
  159. package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
  160. package/dist/repl-DveXw36T.mjs +0 -9
  161. package/dist/skills-CD3Orlex.mjs.map +0 -1
  162. package/dist/stt-CpLYbGFd.mjs +0 -433
  163. package/dist/stt-CpLYbGFd.mjs.map +0 -1
  164. package/dist/stt-DRPLEEHB.mjs +0 -3
  165. package/dist/stt-Te8Qz-Ay.js +0 -433
  166. package/dist/stt-Te8Qz-Ay.js.map +0 -1
  167. package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
  168. package/dist/transformers.web-DokyH3rP.js +0 -3
  169. package/dist/transformers.web-M6mCnEYJ.js +0 -30382
  170. package/dist/transformers.web-M6mCnEYJ.js.map +0 -1
  171. package/dist/tts-C0xx3CtE.js +0 -724
  172. package/dist/tts-C0xx3CtE.js.map +0 -1
  173. package/dist/tts-DXgsKGCe.mjs +0 -3
  174. package/dist/tts-DeGANMNV.mjs +0 -730
  175. package/dist/tts-DeGANMNV.mjs.map +0 -1
  176. package/dist/types-CiTc7ez3.d.mts.map +0 -1
  177. /package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
  178. /package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
  179. /package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0
package/docs/tts.md CHANGED
@@ -1,569 +1,147 @@
1
1
  # Text-to-Speech
2
2
 
3
- Gerbil provides on-device text-to-speech using the [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) model. Generate natural-sounding speech locally without API keys or internet connection.
3
+ Gerbil's native WebGPU engine synthesizes speech with **Kani-TTS-2**, an LFM2-350M
4
+ codec-LM backbone driving NVIDIA's NeMo **NanoCodec** decoder (FSQ + causal HiFi-GAN).
5
+ `engine.speak()` returns **22.05 kHz mono PCM**, fully on-device, no ONNX.
4
6
 
5
- ## Quick Start
6
-
7
- ### Node.js / CLI
8
-
9
- ```typescript
10
- import { Gerbil } from "@tryhamster/gerbil";
11
-
12
- const g = new Gerbil();
13
-
14
- // Generate speech
15
- const result = await g.speak("Hello, I'm Gerbil!", {
16
- voice: "af_heart", // American female (highest quality)
17
- speed: 1.0
18
- });
7
+ > **Pre-1.0.** Kani-TTS-2 is the only TTS path. The old Kokoro / Supertonic ONNX /
8
+ > transformers.js lane has been removed. The `Gerbil`-class `speak()` method still works for
9
+ > backward compatibility but is now a thin wrapper over the native engine (see
10
+ > [below](#gerbil-class-speak-native-wrapper)).
19
11
 
20
- // result.audio = Float32Array (PCM samples)
21
- // result.sampleRate = 24000
22
- // result.duration = seconds
23
- ```
24
-
25
- ### React (Browser)
26
-
27
- ```tsx
28
- import { useSpeech } from "@tryhamster/gerbil/browser";
29
-
30
- function SpeechDemo() {
31
- // Default: Kokoro TTS (24kHz, 28 voices)
32
- const { speak, stop, isSpeaking, isLoading, listVoices, setVoice } = useSpeech();
33
-
34
- // Or use Supertonic (44.1kHz, 4 voices, faster)
35
- // const { speak, listVoices } = useSpeech({ model: "supertonic-66m" });
36
-
37
- if (isLoading) return <div>Loading TTS model...</div>;
38
-
39
- return (
40
- <div>
41
- <select onChange={e => setVoice(e.target.value)}>
42
- {listVoices().map(v => (
43
- <option key={v.id} value={v.id}>{v.name} ({v.language})</option>
44
- ))}
45
- </select>
46
-
47
- <button onClick={() => speak("Hello world!")}>
48
- {isSpeaking ? "Speaking..." : "Speak"}
49
- </button>
50
-
51
- {isSpeaking && <button onClick={stop}>Stop</button>}
52
- </div>
53
- );
54
- }
55
- ```
12
+ ## Quick Start
56
13
 
57
- ### AI SDK
14
+ ### Node
58
15
 
59
16
  ```typescript
60
- import { experimental_generateSpeech as generateSpeech } from "ai";
61
- import { gerbil } from "@tryhamster/gerbil/ai";
62
-
63
- const audio = await generateSpeech({
64
- model: gerbil.speech(),
65
- text: "Hello from Gerbil!",
66
- voice: "bf_emma", // British female
67
- });
68
-
69
- // audio.audioData = Uint8Array (WAV format)
70
- ```
17
+ import { WebGPUEngine } from "@tryhamster/gerbil/gpu";
71
18
 
72
- ### CLI
19
+ const engine = await WebGPUEngine.create({ repo: "nineninesix/kani-tts-2-en" });
73
20
 
74
- ```bash
75
- # Speak text with default voice
76
- gerbil speak "Hello world"
21
+ const { pcm, sampleRate, audioSeconds } = await engine.speak("Hello, I'm Gerbil!");
22
+ // pcm: Float32Array in [-1, 1], sampleRate === 22050
23
+ console.log(`${audioSeconds.toFixed(2)}s of audio`);
77
24
 
78
- # Specify voice and speed
79
- gerbil speak --voice bf_emma --speed 1.2 "Cheerio!"
25
+ engine.destroy();
80
26
  ```
81
27
 
82
- ### Skills
28
+ ### Options
83
29
 
84
30
  ```typescript
85
- import { speak, announce, readAloud } from "@tryhamster/gerbil/skills";
86
-
87
- // Simple speech
88
- await speak({ text: "Hello world", voice: "af_heart" });
89
-
90
- // AI-crafted announcement
91
- await announce({
92
- message: "Build completed successfully",
93
- style: "excited", // casual, formal, excited, calm, urgent
94
- voice: "af_bella"
95
- });
96
-
97
- // Read file aloud
98
- await readAloud({
99
- content: "./README.md",
100
- voice: "bf_emma",
101
- summarizeIfLong: true // Summarize if > 5000 chars
31
+ const result = await engine.speak("Speak with feeling.", {
32
+ languageTag: "en_us", // prepended as "{tag}: {text}" (default "en_us")
33
+ temperature: 1.0, // sampling temperature (default 1.0)
34
+ topP: 0.95, // nucleus threshold (default 0.95)
35
+ repetitionPenalty: 1.1, // (default 1.1)
36
+ maxFrames: 2000, // cap audio length (default: unbounded up to maxSeqLen)
102
37
  });
103
38
  ```
104
39
 
105
- ## Available TTS Models
106
-
107
- Gerbil supports multiple TTS backends:
108
-
109
- | Model | Size | Sample Rate | Voices | Speed | Notes |
110
- |-------|------|-------------|--------|-------|-------|
111
- | `kokoro-82m` | ~330MB | 24000 Hz | 28 | ~100x RT | Default, highest quality |
112
- | `supertonic-66m` | ~250MB | 44100 Hz | 4 | ~167x RT | Faster, HiFi output |
113
-
114
- ### Selecting a Model
115
-
116
- ```typescript
117
- // Use default (Kokoro)
118
- await g.loadTTS();
119
-
120
- // Use Supertonic (faster, higher sample rate)
121
- await g.loadTTS({ model: "supertonic-66m" });
122
-
123
- // List available models
124
- const models = await g.listTTSModels();
125
- // [{ id: "kokoro-82m", sampleRate: 24000, voiceCount: 28 }, ...]
126
- ```
127
-
128
- ## Available Voices
129
-
130
- ### Kokoro Voices
131
-
132
- Kokoro provides 28 voices across American and British English:
133
-
134
- ### American English - Female (Recommended)
40
+ `speak()` requires a Kani-TTS-2 checkpoint (architecture `KaniTTS2ForCausalLM`). The
41
+ NanoCodec decoder checkpoint is downloaded lazily on the first `speak()` call.
135
42
 
136
- | Voice ID | Name | Quality | Description |
137
- |----------|------|---------|-------------|
138
- | `af_heart` | Heart | A | Highest quality, warm tone |
139
- | `af_bella` | Bella | A- | Warm and friendly |
140
- | `af_nicole` | Nicole | B- | Soft and gentle |
141
- | `af_sarah` | Sarah | C+ | Clear and professional |
142
-
143
- ### American English - Male
144
-
145
- | Voice ID | Name | Quality | Description |
146
- |----------|------|---------|-------------|
147
- | `am_fenrir` | Fenrir | C+ | Best male quality |
148
- | `am_michael` | Michael | C+ | Warm and friendly |
149
- | `am_puck` | Puck | C+ | Neutral tone |
150
-
151
- ### British English - Female
152
-
153
- | Voice ID | Name | Quality | Description |
154
- |----------|------|---------|-------------|
155
- | `bf_emma` | Emma | B- | Elegant and clear |
156
- | `bf_isabella` | Isabella | C | Sophisticated |
157
-
158
- ### British English - Male
159
-
160
- | Voice ID | Name | Quality | Description |
161
- |----------|------|---------|-------------|
162
- | `bm_george` | George | C | Distinguished |
163
- | `bm_fable` | Fable | C | Storyteller tone |
164
-
165
- > **Tip:** `af_heart` is the highest quality voice. Start there and experiment with others.
166
-
167
- ### Supertonic Voices
168
-
169
- Supertonic provides 4 high-quality voices:
170
-
171
- | Voice | Gender | Description |
172
- |-------|--------|-------------|
173
- | `F1` | Female | Clear and natural |
174
- | `F2` | Female | Warm and expressive |
175
- | `M1` | Male | Deep and confident |
176
- | `M2` | Male | Friendly and casual |
177
-
178
- ## API Reference
179
-
180
- ### Gerbil Class Methods
43
+ ## Result
181
44
 
182
45
  ```typescript
183
- class Gerbil {
184
- // Load TTS model (auto-called by speak if needed)
185
- async loadTTS(options?: LoadTTSOptions): Promise<void>;
186
-
187
- // Check if TTS is loaded
188
- isTTSLoaded(): boolean;
189
-
190
- // Generate speech
191
- async speak(text: string, options?: SpeakOptions): Promise<SpeakResult>;
192
-
193
- // Stream speech (yields chunks as generated)
194
- async *speakStream(text: string, options?: SpeakOptions): AsyncGenerator<AudioChunk>;
195
-
196
- // Get available voices
197
- listVoices(): VoiceInfo[];
198
- getVoice(voiceId: string): VoiceInfo | null;
46
+ interface SpeakResult {
47
+ /** Mono PCM in [-1, 1]. */
48
+ pcm: Float32Array;
49
+ /** Sample rate — always 22050. */
50
+ sampleRate: number;
51
+ /** Number of audio frames decoded. */
52
+ frames: number;
53
+ /** Audio duration in seconds. */
54
+ audioSeconds: number;
199
55
  }
200
56
  ```
201
57
 
202
- ### SpeakOptions
58
+ ## Standalone KaniTTS
59
+
60
+ `engine.speak()` lazily builds a `KaniTTS` engine under the hood. You can also use it
61
+ directly (e.g. without a separate text model):
203
62
 
204
63
  ```typescript
205
- interface SpeakOptions {
206
- /** Voice ID (default: "af_heart") */
207
- voice?: string;
208
-
209
- /** Speed multiplier 0.5-2.0 (default: 1.0) */
210
- speed?: number;
211
-
212
- /** Progress callback during loading */
213
- onProgress?: (info: ProgressInfo) => void;
214
-
215
- /** Callback for each audio chunk (streaming) */
216
- onAudioChunk?: (chunk: AudioChunk) => void;
217
- }
218
- ```
64
+ import { KaniTTS } from "@tryhamster/gerbil/gpu";
219
65
 
220
- ### SpeakResult
66
+ const tts = await KaniTTS.create({
67
+ repo: "nineninesix/kani-tts-2-en",
68
+ // codecRepo defaults to the NeMo 22 kHz NanoCodec MLX checkpoint
69
+ });
221
70
 
222
- ```typescript
223
- interface SpeakResult {
224
- /** PCM audio samples (mono, float32, -1 to 1) */
225
- audio: Float32Array;
226
-
227
- /** Sample rate (always 24000 for Kokoro) */
228
- sampleRate: number;
229
-
230
- /** Duration in seconds */
231
- duration: number;
232
-
233
- /** Voice ID used */
234
- voice: string;
235
-
236
- /** Generation time in milliseconds */
237
- totalTime: number;
238
- }
71
+ const { pcm, sampleRate } = await tts.speak("Hello world!");
72
+ tts.destroy();
239
73
  ```
240
74
 
241
- ## Playing Audio
75
+ ## Playing / Saving Audio
242
76
 
243
77
  ### Browser (Web Audio API)
244
78
 
245
79
  ```typescript
246
- import { playAudio } from "@tryhamster/gerbil/browser";
247
-
248
- const result = await gerbil.speak("Hello!");
249
-
250
- // One-liner playback
251
- const controller = await playAudio(result.audio, result.sampleRate);
252
-
253
- // Stop early
254
- controller.stop();
255
-
256
- // Wait for completion
257
- await controller.onEnded;
80
+ const ctx = new AudioContext({ sampleRate });
81
+ const buffer = ctx.createBuffer(1, pcm.length, sampleRate);
82
+ buffer.copyToChannel(pcm, 0);
83
+ const source = ctx.createBufferSource();
84
+ source.buffer = buffer;
85
+ source.connect(ctx.destination);
86
+ source.start();
258
87
  ```
259
88
 
260
- ### Node.js (Save to File)
89
+ ### Node (save to WAV)
261
90
 
262
91
  ```typescript
263
- import { writeFileSync } from "fs";
264
- import { execSync } from "child_process";
92
+ import { writeFileSync } from "node:fs";
265
93
 
266
- const result = await gerbil.speak("Hello!");
267
-
268
- // Convert to WAV
269
94
  function saveWav(filename: string, audio: Float32Array, sampleRate: number) {
270
95
  const buffer = Buffer.alloc(44 + audio.length * 2);
271
-
272
- // WAV header
273
96
  buffer.write("RIFF", 0);
274
97
  buffer.writeUInt32LE(36 + audio.length * 2, 4);
275
98
  buffer.write("WAVE", 8);
276
99
  buffer.write("fmt ", 12);
277
100
  buffer.writeUInt32LE(16, 16);
278
- buffer.writeUInt16LE(1, 20); // PCM
279
- buffer.writeUInt16LE(1, 22); // Mono
101
+ buffer.writeUInt16LE(1, 20); // PCM
102
+ buffer.writeUInt16LE(1, 22); // mono
280
103
  buffer.writeUInt32LE(sampleRate, 24);
281
104
  buffer.writeUInt32LE(sampleRate * 2, 28);
282
105
  buffer.writeUInt16LE(2, 32);
283
106
  buffer.writeUInt16LE(16, 34);
284
107
  buffer.write("data", 36);
285
108
  buffer.writeUInt32LE(audio.length * 2, 40);
286
-
287
- // Audio data
288
109
  for (let i = 0; i < audio.length; i++) {
289
110
  const s = Math.max(-1, Math.min(1, audio[i]));
290
111
  buffer.writeInt16LE(Math.round(s * 32767), 44 + i * 2);
291
112
  }
292
-
293
113
  writeFileSync(filename, buffer);
294
114
  }
295
115
 
296
- saveWav("output.wav", result.audio, result.sampleRate);
297
-
298
- // Play on macOS
299
- execSync("afplay output.wav");
300
-
301
- // Play on Linux
302
- // execSync("aplay output.wav");
116
+ const { pcm, sampleRate } = await engine.speak("Hello!");
117
+ saveWav("output.wav", pcm, sampleRate); // 22.05 kHz mono
118
+ // macOS playback: import { execSync } from "node:child_process"; then execSync("afplay output.wav");
303
119
  ```
304
120
 
305
- ## Streaming Audio
306
-
307
- For long text, stream audio chunks as they're generated:
308
-
309
- ```typescript
310
- // Node.js streaming
311
- for await (const chunk of gerbil.speakStream("Long paragraph of text...")) {
312
- console.log(`Chunk ${chunk.index}: ${chunk.samples.length} samples`);
313
- // Process/play chunk.samples
314
-
315
- if (chunk.isFinal) {
316
- console.log("Done!");
317
- }
318
- }
121
+ ## Model
319
122
 
320
- // Browser streaming with seamless playback
321
- import { createAudioPlayer } from "@tryhamster/gerbil/browser";
123
+ ### Kani-TTS-2
322
124
 
323
- const player = createAudioPlayer(24000);
324
-
325
- for await (const chunk of gerbil.speakStream("Long text...")) {
326
- player.queue(chunk.samples);
327
- }
328
- ```
329
-
330
- ## useSpeech Hook Reference
331
-
332
- ```typescript
333
- const {
334
- // Actions
335
- speak, // (text: string, opts?) => Promise<void>
336
- stop, // () => void
337
- load, // () => void (manual load trigger)
338
-
339
- // State
340
- isLoading, // boolean - model loading
341
- isSpeaking, // boolean - currently speaking
342
- isReady, // boolean - model ready
343
- error, // string | null
344
-
345
- // Voice control
346
- listVoices, // () => VoiceInfo[] - voices for current model
347
- currentVoice, // string
348
- setVoice, // (id: string) => void
349
- currentSpeed, // number
350
- setSpeed, // (speed: number) => void
351
-
352
- // Model info
353
- currentModel, // TTSModelId - "kokoro-82m" | "supertonic-66m"
354
- sampleRate, // number - 24000 (Kokoro) or 44100 (Supertonic)
355
-
356
- // Loading progress
357
- loadingProgress, // { status, file?, progress? }
358
- } = useSpeech({
359
- model: "kokoro-82m", // TTS model: "kokoro-82m" (default) or "supertonic-66m"
360
- voice: "af_heart", // default voice (model-specific)
361
- speed: 1.0, // default speed
362
- autoLoad: false, // load on first speak() call
363
- onReady: () => {},
364
- onError: (err) => {},
365
- onStart: () => {},
366
- onEnd: () => {},
367
- });
368
- ```
369
-
370
- ### Model-Specific Voices
371
-
372
- Each model has its own set of voices:
373
-
374
- ```typescript
375
- // Kokoro (default) - 28 voices
376
- const { listVoices } = useSpeech(); // or model: "kokoro-82m"
377
- // → af_heart, af_bella, am_fenrir, bf_emma, bm_george, ...
378
-
379
- // Supertonic - 4 voices
380
- const { listVoices } = useSpeech({ model: "supertonic-66m" });
381
- // → F1, F2, M1, M2
382
- ```
383
-
384
- **Note:** Voice IDs are model-specific. Using a Kokoro voice with Supertonic (or vice versa) will throw an error.
385
-
386
- ## Performance
387
-
388
- | Metric | Value |
389
- |--------|-------|
390
- | Model size | ~330MB |
391
- | First load | 3-10s (downloads model) |
392
- | Cached load | <1s |
393
- | Generation speed | ~8x realtime on M1 Mac |
394
- | Sample rate | 24kHz |
395
- | Audio format | Mono Float32 PCM |
396
-
397
- ## Troubleshooting
398
-
399
- ### Audio sounds like gibberish
400
-
401
- This happens when using raw transformers.js without proper phoneme conversion. Gerbil uses `kokoro-js` which handles grapheme-to-phoneme (G2P) conversion automatically.
402
-
403
- ### Audio is too quiet
404
-
405
- Kokoro output can be quiet. The audio is normalized by default, but if you're processing raw audio, ensure proper normalization:
406
-
407
- ```typescript
408
- function normalizeAudio(audio: Float32Array, targetRms = 0.15): Float32Array {
409
- const currentRms = Math.sqrt(audio.reduce((a, b) => a + b * b, 0) / audio.length);
410
- const gain = targetRms / currentRms;
411
- return audio.map(s => Math.max(-1, Math.min(1, s * gain)));
412
- }
413
- ```
414
-
415
- ### Browser autoplay blocked
416
-
417
- Browsers require user interaction before playing audio. Trigger speech from a click handler:
418
-
419
- ```tsx
420
- <button onClick={() => speak("Hello!")}>Speak</button>
421
- ```
422
-
423
- ### Voice not found
424
-
425
- Use `listVoices()` to see available voice IDs. Voice IDs follow the pattern `{language}{gender}_{name}`:
426
- - `af_` = American female
427
- - `am_` = American male
428
- - `bf_` = British female
429
- - `bm_` = British male
430
-
431
- ## Model Info
432
-
433
- ### Kokoro-82M
434
-
435
- Uses [Kokoro-82M](https://huggingface.co/hexgrad/Kokoro-82M) via the [ONNX export](https://huggingface.co/onnx-community/Kokoro-82M-v1.0-ONNX) and [kokoro-js](https://www.npmjs.com/package/kokoro-js) wrapper.
436
-
437
- - **Architecture:** StyleTTS2-based
438
- - **Parameters:** 82M
439
- - **Languages:** English (US + UK)
440
- - **License:** Apache 2.0
441
-
442
- ### Supertonic-66M
443
-
444
- Uses [Supertonic-TTS](https://huggingface.co/onnx-community/Supertonic-TTS-ONNX) via transformers.js pipeline.
445
-
446
- - **Parameters:** 66M
447
- - **Languages:** English
448
- - **Sample Rate:** 44.1kHz (HiFi)
449
- - **Speed:** ~167x realtime
450
- - **License:** Apache 2.0
125
+ - **Backbone:** LFM2-350M codec-LM (architecture `KaniTTS2ForCausalLM`)
126
+ - **Codec:** NVIDIA NeMo NanoCodec (FSQ + causal HiFi-GAN), validated bit-exact
127
+ - **Sample rate:** 22.05 kHz
128
+ - **Repos:** `nineninesix/kani-tts-2-en`. License varies by variant (the `kani-tts-2-en`
129
+ checkpoint is LFM1.0/other; a 450M variant is Apache 2.0).
451
130
 
452
131
  ---
453
132
 
454
- ## Adding New TTS Models (Developer Guide)
133
+ ## `Gerbil`-class `speak()` (native wrapper)
455
134
 
456
- To add a new TTS model to Gerbil, implement it in these locations:
457
-
458
- ### 1. Core TTS Class (`src/core/tts.ts`)
459
-
460
- Add your model to the TTS registry and create a class that implements the TTS interface:
135
+ > The Kokoro-82M / Supertonic-66M ONNX/transformers.js TTS lane has been removed. The
136
+ > `Gerbil`-class `speak()` method below now runs the native Kani-TTS-2 engine under the hood
137
+ > (it requires WebGPU and returns 22.05 kHz PCM). The browser `useSpeech` hook is gone — use
138
+ > `useTTS` from `@tryhamster/gerbil/gpu/hooks`. The AI SDK `gerbil.speech()` provider also
139
+ > routes through this native path.
461
140
 
462
141
  ```typescript
463
- // 1. Add voice definitions
464
- export const MY_MODEL_VOICES: VoiceInfo[] = [
465
- { id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
466
- // ...
467
- ];
468
-
469
- // 2. Add to TTS_MODELS registry
470
- export const TTS_MODELS: Record<string, TTSModelConfig> = {
471
- // ...existing models...
472
- "my-model-50m": {
473
- id: "my-model-50m",
474
- repo: "hf-org/my-model-onnx",
475
- description: "My Model 50M - Description",
476
- size: "~150MB",
477
- sampleRate: 22050,
478
- voices: MY_MODEL_VOICES,
479
- defaultVoice: "voice1",
480
- languages: ["en"],
481
- },
482
- };
483
-
484
- // 3. Create TTS class (see KokoroTTS or SupertonicTTS as examples)
485
- export class MyModelTTS {
486
- async load(options: LoadTTSOptions): Promise<void> { /* ... */ }
487
- async speak(text: string, options: SpeakOptions): Promise<SpeakResult> { /* ... */ }
488
- listVoices(): VoiceInfo[] { /* ... */ }
489
- // ...
490
- }
491
-
492
- // 4. Update createTTS factory
493
- export function createTTS(modelId: string = "kokoro-82m"): TTSBackend {
494
- if (modelId.startsWith("my-model")) {
495
- return new MyModelTTS(modelId);
496
- }
497
- // ...existing logic...
498
- }
499
- ```
500
-
501
- ### 2. Browser Hook (`src/browser/index.ts`)
502
-
503
- Add browser support in the useSpeech hook:
504
-
505
- ```typescript
506
- // 1. Add voice definitions for browser
507
- const MY_MODEL_BROWSER_VOICES: BrowserVoiceInfo[] = [
508
- { id: "voice1", name: "Voice 1", gender: "female", language: "en", description: "..." },
509
- ];
510
-
511
- // 2. Update TTS_MODELS in browser
512
- const TTS_MODELS: Record<TTSModelId, {...}> = {
513
- // ...existing models...
514
- "my-model-50m": {
515
- repo: "hf-org/my-model-onnx",
516
- defaultVoice: "voice1",
517
- sampleRate: 22050,
518
- voices: MY_MODEL_BROWSER_VOICES,
519
- },
520
- };
521
-
522
- // 3. Update TTSModelId type
523
- export type TTSModelId = "kokoro-82m" | "supertonic-66m" | "my-model-50m";
524
-
525
- // 4. Add loading logic in useSpeech useEffect
526
- if (modelId === "my-model-50m") {
527
- // Load using appropriate method (pipeline, custom loader, etc.)
528
- }
529
- ```
530
-
531
- ### 3. Gerbil Class (`src/core/gerbil.ts`)
532
-
533
- Update the main Gerbil class to support the new model:
534
-
535
- ```typescript
536
- // Add type import
537
- type MyModelTTSType = import("./tts.js").MyModelTTS;
142
+ import { Gerbil } from "@tryhamster/gerbil";
538
143
 
539
- // Update TTSBackendType union
540
- type TTSBackendType = KokoroTTSType | SupertonicTTSType | MyModelTTSType;
144
+ const g = new Gerbil();
145
+ const result = await g.speak("Hello!");
146
+ // result.audio = Float32Array, result.sampleRate = 22050 (native Kani-TTS-2)
541
147
  ```
542
-
543
- ### 4. Integration Points
544
-
545
- Update these integration files if applicable:
546
-
547
- - **AI SDK** (`src/integrations/ai-sdk.ts`): Add speech model support
548
- - **Skills** (`src/skills/builtin/speak.ts`, etc.): Update voice enums
549
- - **CLI** (`src/cli/index.ts`): Add model to CLI options
550
-
551
- ### 5. Documentation
552
-
553
- Update documentation:
554
-
555
- - `docs/tts.md`: Add model to tables and examples
556
- - `.cursor/rules/gerbil/tts.mdc`: Update implementation patterns
557
-
558
- ### File Summary
559
-
560
- | File | Purpose |
561
- |------|---------|
562
- | `src/core/tts.ts` | Core TTS classes, voice registry, model config |
563
- | `src/browser/index.ts` | React hooks (useSpeech, useVoiceChat) |
564
- | `src/core/gerbil.ts` | Main Gerbil class integration |
565
- | `src/integrations/ai-sdk.ts` | AI SDK v5 speech model |
566
- | `src/skills/builtin/speak.ts` | Speak skill voice options |
567
- | `docs/tts.md` | User documentation |
568
- | `.cursor/rules/gerbil/tts.mdc` | Developer patterns |
569
-