@tryhamster/gerbil 1.0.0-rc.0 → 1.0.0-rc.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +79 -14
- package/dist/auto-update-S9s5-g0C.mjs +3 -0
- package/dist/browser/index.d.ts +1009 -0
- package/dist/browser/index.d.ts.map +1 -0
- package/dist/browser/index.js +2492 -0
- package/dist/browser/index.js.map +1 -0
- package/dist/{chrome-backend-C5Un08O4.mjs → chrome-backend-CORwaIyC.mjs} +514 -73
- package/dist/chrome-backend-CORwaIyC.mjs.map +1 -0
- package/dist/{chrome-backend-CtwPENIW.mjs → chrome-backend-DIKYoWj-.mjs} +1 -1
- package/dist/cli.mjs +3359 -647
- package/dist/cli.mjs.map +1 -1
- package/dist/frameworks/express.d.mts +1 -1
- package/dist/frameworks/express.mjs +3 -4
- package/dist/frameworks/express.mjs.map +1 -1
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.mjs +2 -3
- package/dist/frameworks/fastify.mjs.map +1 -1
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.mjs +2 -3
- package/dist/frameworks/hono.mjs.map +1 -1
- package/dist/frameworks/next.d.mts +2 -2
- package/dist/frameworks/next.mjs +2 -3
- package/dist/frameworks/next.mjs.map +1 -1
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.mjs +2 -3
- package/dist/frameworks/trpc.mjs.map +1 -1
- package/dist/gerbil-DJGqq7BX.mjs +4 -0
- package/dist/gerbil-DoDGHe6Z.mjs +1631 -0
- package/dist/gerbil-DoDGHe6Z.mjs.map +1 -0
- package/dist/gerbil-qOTe1nl2.d.mts +431 -0
- package/dist/gerbil-qOTe1nl2.d.mts.map +1 -0
- package/dist/index.d.mts +411 -9
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +7 -6
- package/dist/index.mjs.map +1 -1
- package/dist/integrations/ai-sdk.d.mts +122 -4
- package/dist/integrations/ai-sdk.d.mts.map +1 -1
- package/dist/integrations/ai-sdk.mjs +238 -11
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +132 -2
- package/dist/integrations/langchain.d.mts.map +1 -1
- package/dist/integrations/langchain.mjs +175 -8
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.mjs +2 -3
- package/dist/integrations/llamaindex.mjs.map +1 -1
- package/dist/integrations/mcp-client.mjs +4 -4
- package/dist/integrations/mcp-client.mjs.map +1 -1
- package/dist/integrations/mcp.d.mts +2 -2
- package/dist/integrations/mcp.d.mts.map +1 -1
- package/dist/integrations/mcp.mjs +5 -6
- package/dist/kokoro-BNTb6egA.mjs +20210 -0
- package/dist/kokoro-BNTb6egA.mjs.map +1 -0
- package/dist/kokoro-CMOGDSgT.js +20212 -0
- package/dist/kokoro-CMOGDSgT.js.map +1 -0
- package/dist/{mcp-R8kRLIKb.mjs → mcp-kzDDWIoS.mjs} +10 -37
- package/dist/mcp-kzDDWIoS.mjs.map +1 -0
- package/dist/microphone-DaMZFRuR.mjs +3 -0
- package/dist/{one-liner-BUQR0nqq.mjs → one-liner-DxnNs_JK.mjs} +2 -2
- package/dist/{one-liner-BUQR0nqq.mjs.map → one-liner-DxnNs_JK.mjs.map} +1 -1
- package/dist/repl-DGUw4fCc.mjs +9 -0
- package/dist/skills/index.d.mts +305 -14
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +5 -6
- package/dist/skills-DulrOPeP.mjs +1435 -0
- package/dist/skills-DulrOPeP.mjs.map +1 -0
- package/dist/stt-1WIefHwc.mjs +3 -0
- package/dist/stt-CG_7KB_0.mjs +434 -0
- package/dist/stt-CG_7KB_0.mjs.map +1 -0
- package/dist/stt-Dne6SENv.js +434 -0
- package/dist/stt-Dne6SENv.js.map +1 -0
- package/dist/{tools-BsiEE6f2.mjs → tools-Bi1P7Xoy.mjs} +6 -7
- package/dist/{tools-BsiEE6f2.mjs.map → tools-Bi1P7Xoy.mjs.map} +1 -1
- package/dist/transformers.web-DiD1gTwk.js +44695 -0
- package/dist/transformers.web-DiD1gTwk.js.map +1 -0
- package/dist/transformers.web-u34VxRFM.js +3 -0
- package/dist/tts-B1pZMlDv.mjs +3 -0
- package/dist/tts-C2FzKuSx.js +725 -0
- package/dist/tts-C2FzKuSx.js.map +1 -0
- package/dist/tts-CyHhcLtN.mjs +731 -0
- package/dist/tts-CyHhcLtN.mjs.map +1 -0
- package/dist/types-CiTc7ez3.d.mts +353 -0
- package/dist/types-CiTc7ez3.d.mts.map +1 -0
- package/dist/{utils-7vXqtq2Q.mjs → utils-CZBZ8dgR.mjs} +1 -1
- package/dist/{utils-7vXqtq2Q.mjs.map → utils-CZBZ8dgR.mjs.map} +1 -1
- package/docs/ai-sdk.md +137 -21
- package/docs/browser.md +241 -2
- package/docs/memory.md +72 -0
- package/docs/stt.md +494 -0
- package/docs/tts.md +569 -0
- package/docs/vision.md +396 -0
- package/package.json +21 -22
- package/dist/auto-update-BbNHbSU1.mjs +0 -3
- package/dist/browser/index.d.mts +0 -262
- package/dist/browser/index.d.mts.map +0 -1
- package/dist/browser/index.mjs +0 -755
- package/dist/browser/index.mjs.map +0 -1
- package/dist/chrome-backend-C5Un08O4.mjs.map +0 -1
- package/dist/gerbil-BfnsFWRE.mjs +0 -644
- package/dist/gerbil-BfnsFWRE.mjs.map +0 -1
- package/dist/gerbil-BjW-z7Fq.mjs +0 -5
- package/dist/gerbil-DZ1k3ChC.d.mts +0 -138
- package/dist/gerbil-DZ1k3ChC.d.mts.map +0 -1
- package/dist/mcp-R8kRLIKb.mjs.map +0 -1
- package/dist/models-DKULvhOr.mjs +0 -136
- package/dist/models-DKULvhOr.mjs.map +0 -1
- package/dist/models-De2-_GmQ.d.mts +0 -22
- package/dist/models-De2-_GmQ.d.mts.map +0 -1
- package/dist/skills-D3CEpgDc.mjs +0 -630
- package/dist/skills-D3CEpgDc.mjs.map +0 -1
- package/dist/types-BS1N92Jt.d.mts +0 -183
- package/dist/types-BS1N92Jt.d.mts.map +0 -1
- /package/dist/{chunk-Ct1HF2bE.mjs → chunk-CkXuGtQK.mjs} +0 -0
|
@@ -0,0 +1,725 @@
|
|
|
1
|
+
//#region src/core/tts.ts
// Splits text on the whitespace that follows sentence-ending punctuation
// (., !, ?). The lookbehind keeps the punctuation attached to the sentence
// it terminates, so rejoined chunks read naturally.
const SENTENCE_SPLIT_REGEX = /(?<=[.!?])\s+/;
|
|
3
|
+
/**
 * Kokoro voice definitions
 * Voice IDs follow pattern: {language}{gender}_{name}
 * - a = American English
 * - b = British English
 * - f = female, m = male
 *
 * The "(Grade X)" suffix in each description is a quality rating carried
 * inside the description string; there is no separate grade field.
 * `embeddingFile` is the relative path of the voice embedding inside the
 * model repository.
 */
const KOKORO_VOICES = [
	// --- American English, female (af_*) ---
	{
		id: "af_heart",
		name: "Heart",
		gender: "female",
		language: "en-us",
		description: "American female, highest quality voice (Grade A)",
		embeddingFile: "voices/af_heart.bin"
	},
	{
		id: "af_bella",
		name: "Bella",
		gender: "female",
		language: "en-us",
		description: "American female, warm and friendly (Grade A-)",
		embeddingFile: "voices/af_bella.bin"
	},
	{
		id: "af_nicole",
		name: "Nicole",
		gender: "female",
		language: "en-us",
		description: "American female, soft and gentle (Grade B-)",
		embeddingFile: "voices/af_nicole.bin"
	},
	{
		id: "af_sarah",
		name: "Sarah",
		gender: "female",
		language: "en-us",
		description: "American female, clear and professional (Grade C+)",
		embeddingFile: "voices/af_sarah.bin"
	},
	{
		id: "af_sky",
		name: "Sky",
		gender: "female",
		language: "en-us",
		description: "American female, young and energetic (Grade C-)",
		embeddingFile: "voices/af_sky.bin"
	},
	{
		id: "af_alloy",
		name: "Alloy",
		gender: "female",
		language: "en-us",
		description: "American female (Grade C)",
		embeddingFile: "voices/af_alloy.bin"
	},
	{
		id: "af_aoede",
		name: "Aoede",
		gender: "female",
		language: "en-us",
		description: "American female (Grade C+)",
		embeddingFile: "voices/af_aoede.bin"
	},
	{
		id: "af_kore",
		name: "Kore",
		gender: "female",
		language: "en-us",
		description: "American female (Grade C+)",
		embeddingFile: "voices/af_kore.bin"
	},
	{
		id: "af_nova",
		name: "Nova",
		gender: "female",
		language: "en-us",
		description: "American female (Grade C)",
		embeddingFile: "voices/af_nova.bin"
	},
	{
		id: "af_river",
		name: "River",
		gender: "female",
		language: "en-us",
		description: "American female (Grade D)",
		embeddingFile: "voices/af_river.bin"
	},
	{
		id: "af_jessica",
		name: "Jessica",
		gender: "female",
		language: "en-us",
		description: "American female (Grade D)",
		embeddingFile: "voices/af_jessica.bin"
	},
	// --- American English, male (am_*) ---
	{
		id: "am_fenrir",
		name: "Fenrir",
		gender: "male",
		language: "en-us",
		description: "American male, best quality (Grade C+)",
		embeddingFile: "voices/am_fenrir.bin"
	},
	{
		id: "am_michael",
		name: "Michael",
		gender: "male",
		language: "en-us",
		description: "American male, warm and friendly (Grade C+)",
		embeddingFile: "voices/am_michael.bin"
	},
	{
		id: "am_puck",
		name: "Puck",
		gender: "male",
		language: "en-us",
		description: "American male (Grade C+)",
		embeddingFile: "voices/am_puck.bin"
	},
	{
		id: "am_adam",
		name: "Adam",
		gender: "male",
		language: "en-us",
		description: "American male, deep voice (Grade F+)",
		embeddingFile: "voices/am_adam.bin"
	},
	{
		id: "am_echo",
		name: "Echo",
		gender: "male",
		language: "en-us",
		description: "American male (Grade D)",
		embeddingFile: "voices/am_echo.bin"
	},
	{
		id: "am_eric",
		name: "Eric",
		gender: "male",
		language: "en-us",
		description: "American male (Grade D)",
		embeddingFile: "voices/am_eric.bin"
	},
	{
		id: "am_liam",
		name: "Liam",
		gender: "male",
		language: "en-us",
		description: "American male (Grade D)",
		embeddingFile: "voices/am_liam.bin"
	},
	{
		id: "am_onyx",
		name: "Onyx",
		gender: "male",
		language: "en-us",
		description: "American male (Grade D)",
		embeddingFile: "voices/am_onyx.bin"
	},
	{
		id: "am_santa",
		name: "Santa",
		gender: "male",
		language: "en-us",
		description: "American male, festive (Grade D-)",
		embeddingFile: "voices/am_santa.bin"
	},
	// --- British English, female (bf_*) ---
	{
		id: "bf_emma",
		name: "Emma",
		gender: "female",
		language: "en-gb",
		description: "British female, elegant and clear (Grade B-)",
		embeddingFile: "voices/bf_emma.bin"
	},
	{
		id: "bf_isabella",
		name: "Isabella",
		gender: "female",
		language: "en-gb",
		description: "British female, sophisticated (Grade C)",
		embeddingFile: "voices/bf_isabella.bin"
	},
	{
		id: "bf_alice",
		name: "Alice",
		gender: "female",
		language: "en-gb",
		description: "British female (Grade D)",
		embeddingFile: "voices/bf_alice.bin"
	},
	{
		id: "bf_lily",
		name: "Lily",
		gender: "female",
		language: "en-gb",
		description: "British female (Grade D)",
		embeddingFile: "voices/bf_lily.bin"
	},
	// --- British English, male (bm_*) ---
	{
		id: "bm_george",
		name: "George",
		gender: "male",
		language: "en-gb",
		description: "British male, distinguished (Grade C)",
		embeddingFile: "voices/bm_george.bin"
	},
	{
		id: "bm_fable",
		name: "Fable",
		gender: "male",
		language: "en-gb",
		description: "British male (Grade C)",
		embeddingFile: "voices/bm_fable.bin"
	},
	{
		id: "bm_lewis",
		name: "Lewis",
		gender: "male",
		language: "en-gb",
		description: "British male, friendly (Grade D+)",
		embeddingFile: "voices/bm_lewis.bin"
	},
	{
		id: "bm_daniel",
		name: "Daniel",
		gender: "male",
		language: "en-gb",
		description: "British male (Grade D)",
		embeddingFile: "voices/bm_daniel.bin"
	}
];
|
|
236
|
+
/**
 * Supertonic voice definitions
 * 4 built-in voices: F1, F2 (female), M1, M2 (male)
 *
 * Same record shape as KOKORO_VOICES. Language is plain "en" here
 * (no regional variant), and `embeddingFile` points into the model repo.
 */
const SUPERTONIC_VOICES = [
	{
		id: "F1",
		name: "Female 1",
		gender: "female",
		language: "en",
		description: "Female voice 1 - Clear and natural",
		embeddingFile: "voices/F1.bin"
	},
	{
		id: "F2",
		name: "Female 2",
		gender: "female",
		language: "en",
		description: "Female voice 2 - Warm and expressive",
		embeddingFile: "voices/F2.bin"
	},
	{
		id: "M1",
		name: "Male 1",
		gender: "male",
		language: "en",
		description: "Male voice 1 - Deep and confident",
		embeddingFile: "voices/M1.bin"
	},
	{
		id: "M2",
		name: "Male 2",
		gender: "male",
		language: "en",
		description: "Male voice 2 - Friendly and casual",
		embeddingFile: "voices/M2.bin"
	}
];
|
|
274
|
+
// Registry of supported TTS models, keyed by model ID. Consumed by
// getTTSModelConfig() and the engine constructors below.
const TTS_MODELS = {
	"kokoro-82m": {
		id: "kokoro-82m",
		repo: "onnx-community/Kokoro-82M-v1.0-ONNX",
		description: "Kokoro 82M - High-quality multilingual TTS",
		size: "~330MB",
		// 24000 Hz output
		sampleRate: 24e3,
		voices: KOKORO_VOICES,
		defaultVoice: "af_heart",
		languages: ["en-us", "en-gb"]
	},
	"supertonic-66m": {
		id: "supertonic-66m",
		repo: "onnx-community/Supertonic-TTS-ONNX",
		description: "Supertonic 66M - Fast on-device TTS (167x realtime)",
		size: "~250MB",
		// 44100 Hz output
		sampleRate: 44100,
		voices: SUPERTONIC_VOICES,
		defaultVoice: "F1",
		languages: ["en"]
	}
};
|
|
296
|
+
/**
 * Get TTS model config by ID.
 *
 * @param {string} modelId - Model identifier, e.g. "kokoro-82m" or "supertonic-66m".
 * @returns {object|null} The matching TTS_MODELS entry, or null when unknown.
 */
function getTTSModelConfig(modelId) {
	// Guard with Object.hasOwn so inherited Object.prototype members
	// (e.g. "toString", "constructor") are never mistaken for a config:
	// a plain `TTS_MODELS[modelId] || null` lookup would return those
	// truthy inherited values for such keys.
	return Object.hasOwn(TTS_MODELS, modelId) ? TTS_MODELS[modelId] : null;
}
|
|
302
|
+
/**
 * Kokoro TTS - Local text-to-speech with voice selection
 *
 * Uses kokoro-js (official Kokoro library by xenova) for high-quality speech synthesis.
 * Includes proper G2P (grapheme-to-phoneme) conversion for accurate pronunciation.
 */
var KokoroTTS = class {
	// Underlying kokoro-js instance; null until _load() succeeds.
	kokoroInstance = null;
	// Static model metadata (repo, voices, sampleRate, defaultVoice) resolved in the constructor.
	modelConfig;
	// In-flight load promise; deduplicates concurrent load() calls.
	loadPromise = null;
	_isLoaded = false;
	// "webgpu" or "cpu"; decided during _load().
	_deviceMode = "cpu";
	/**
	 * @param {string} [modelId="kokoro-82m"] - Key into TTS_MODELS.
	 * @throws {Error} When modelId is not a known TTS model.
	 */
	constructor(modelId = "kokoro-82m") {
		const config = getTTSModelConfig(modelId);
		if (!config) throw new Error(`Unknown TTS model: ${modelId}. Available: ${Object.keys(TTS_MODELS).join(", ")}`);
		this.modelConfig = config;
	}
	/**
	 * Load the TTS model
	 *
	 * Idempotent: returns immediately if already loaded, and concurrent
	 * callers share the same in-flight promise.
	 *
	 * @example
	 * ```ts
	 * const tts = new KokoroTTS();
	 * await tts.load({
	 *   onProgress: (p) => console.log(p.status, p.progress),
	 *   device: "webgpu",
	 * });
	 * ```
	 */
	async load(options = {}) {
		if (this._isLoaded) return;
		if (this.loadPromise) return this.loadPromise;
		this.loadPromise = this._load(options);
		await this.loadPromise;
	}
	// Actual loading logic; only ever invoked through load().
	async _load(options = {}) {
		const { onProgress, device = "auto" } = options;
		onProgress?.({ status: `Loading TTS model (${this.modelConfig.id})...` });
		try {
			// Lazy chunk import keeps the heavy model code out of the main bundle.
			const { KokoroTTS: KokoroJS } = await import("./kokoro-CMOGDSgT.js");
			const isBrowser = typeof window !== "undefined";
			let dtype = "fp32";
			// "auto" picks WebGPU only in browsers that expose navigator.gpu;
			// WebGPU runs fp16, CPU falls back to fp32.
			if (device === "webgpu" || device === "auto" && isBrowser && "gpu" in navigator) {
				dtype = "fp16";
				this._deviceMode = "webgpu";
			} else {
				dtype = "fp32";
				this._deviceMode = "cpu";
			}
			onProgress?.({ status: `Loading model with ${dtype} precision...` });
			this.kokoroInstance = await KokoroJS.from_pretrained(this.modelConfig.repo, {
				dtype,
				// Relay kokoro-js download progress to the caller's onProgress hook.
				progress_callback: (progress) => {
					if (progress.status === "progress" && progress.file) onProgress?.({
						status: `Downloading ${progress.file}`,
						progress: Math.round(progress.progress || 0),
						file: progress.file
					});
					else if (progress.status === "ready") onProgress?.({ status: "Model ready" });
				}
			});
			this._isLoaded = true;
			onProgress?.({ status: `Ready (${this._deviceMode.toUpperCase()})!` });
		} catch (error) {
			// Reset so a later load() call can retry instead of awaiting a rejected promise.
			this.loadPromise = null;
			throw error;
		}
	}
	/**
	 * Ensure model is loaded (lazy loading)
	 */
	async ensureLoaded(options) {
		if (!this._isLoaded) await this.load(options);
	}
	/**
	 * Get list of available voices
	 *
	 * Returns a fresh array copy so callers cannot mutate the registry.
	 *
	 * @example
	 * ```ts
	 * const voices = tts.listVoices();
	 * // [{ id: "af_heart", name: "Heart", gender: "female", ... }, ...]
	 * ```
	 */
	listVoices() {
		return [...this.modelConfig.voices];
	}
	/**
	 * Get a specific voice by ID
	 *
	 * @returns The voice record, or null when the ID is unknown.
	 */
	getVoice(voiceId) {
		return this.modelConfig.voices.find((v) => v.id === voiceId) || null;
	}
	/**
	 * Get voices by gender
	 */
	getVoicesByGender(gender) {
		return this.modelConfig.voices.filter((v) => v.gender === gender);
	}
	/**
	 * Get voices by language
	 *
	 * Matches exact language codes and prefixes, so "en" matches both
	 * "en-us" and "en-gb".
	 */
	getVoicesByLanguage(language) {
		return this.modelConfig.voices.filter((v) => v.language === language || v.language.startsWith(language));
	}
	/**
	 * Generate speech from text
	 *
	 * Loads the model on first use. Throws on unknown voice IDs.
	 *
	 * @example
	 * ```ts
	 * const result = await tts.speak("Hello world", {
	 *   voice: "af_heart",
	 *   speed: 1.0,
	 * });
	 *
	 * // Play in browser
	 * const audioContext = new AudioContext();
	 * const buffer = audioContext.createBuffer(1, result.audio.length, result.sampleRate);
	 * buffer.copyToChannel(result.audio, 0);
	 * const source = audioContext.createBufferSource();
	 * source.buffer = buffer;
	 * source.connect(audioContext.destination);
	 * source.start();
	 * ```
	 */
	async speak(text, options = {}) {
		await this.ensureLoaded({ onProgress: options.onProgress });
		const { voice = this.modelConfig.defaultVoice, speed = 1 } = options;
		if (!this.getVoice(voice)) throw new Error(`Unknown voice: ${voice}. Use listVoices() to see available options.`);
		if (!this.kokoroInstance) throw new Error("Model not loaded");
		const startTime = performance.now();
		const result = await this.kokoroInstance.generate(text, {
			voice,
			speed
		});
		const totalTime = performance.now() - startTime;
		return {
			audio: result.audio,
			sampleRate: result.sampling_rate,
			duration: result.audio.length / result.sampling_rate,
			voice,
			totalTime
		};
	}
	/**
	 * Stream speech generation (yields audio chunks as they're generated)
	 *
	 * Splits the text into sentences and synthesizes one chunk per sentence.
	 * The concatenated full result is the generator's *return* value; note
	 * that a for-await loop discards generator return values, so retrieve it
	 * via manual next() iteration if needed.
	 *
	 * @example
	 * ```ts
	 * for await (const chunk of tts.speakStream("Long text...")) {
	 *   // chunk.samples = Float32Array
	 *   // chunk.sampleRate = 24000
	 *   // chunk.isFinal = boolean
	 *   playChunk(chunk);
	 * }
	 * ```
	 */
	async *speakStream(text, options = {}) {
		await this.ensureLoaded({ onProgress: options.onProgress });
		const { voice = this.modelConfig.defaultVoice, speed = 1 } = options;
		if (!this.getVoice(voice)) throw new Error(`Unknown voice: ${voice}. Use listVoices() to see available options.`);
		if (!this.kokoroInstance) throw new Error("Model not loaded");
		const startTime = performance.now();
		const sentences = this.splitIntoSentences(text);
		const allAudio = [];
		let chunkIndex = 0;
		// Fallback sample rate if no sentence produces audio; overwritten by
		// the model's actual rate on each generated chunk.
		let sampleRate = this.modelConfig.sampleRate;
		for (let i = 0; i < sentences.length; i++) {
			const sentence = sentences[i];
			if (!sentence.trim()) continue;
			const result = await this.kokoroInstance.generate(sentence, {
				voice,
				speed
			});
			sampleRate = result.sampling_rate;
			allAudio.push(result.audio);
			const chunk = {
				samples: result.audio,
				sampleRate: result.sampling_rate,
				index: chunkIndex++,
				isFinal: i === sentences.length - 1
			};
			yield chunk;
			// Also notify via callback for consumers not driving the iterator directly.
			options.onAudioChunk?.(chunk);
		}
		// Concatenate all chunks into one contiguous buffer for the return value.
		const totalLength = allAudio.reduce((sum, arr) => sum + arr.length, 0);
		const fullAudio = new Float32Array(totalLength);
		let offset = 0;
		for (const chunk of allAudio) {
			fullAudio.set(chunk, offset);
			offset += chunk.length;
		}
		const totalTime = performance.now() - startTime;
		return {
			audio: fullAudio,
			sampleRate,
			duration: fullAudio.length / sampleRate,
			voice,
			totalTime
		};
	}
	/**
	 * Split text into sentences for streaming
	 */
	splitIntoSentences(text) {
		return text.split(SENTENCE_SPLIT_REGEX).filter((s) => s.trim());
	}
	/**
	 * Check if model is loaded
	 */
	isLoaded() {
		return this._isLoaded;
	}
	/**
	 * Get current device mode ("webgpu" or "cpu")
	 */
	getDeviceMode() {
		return this._deviceMode;
	}
	/**
	 * Get model configuration (shallow copy)
	 */
	getModelInfo() {
		return { ...this.modelConfig };
	}
	/**
	 * Get sample rate
	 */
	getSampleRate() {
		return this.modelConfig.sampleRate;
	}
	/**
	 * Dispose of resources
	 *
	 * Drops the model reference and resets load state so the instance can
	 * be reloaded later.
	 */
	async dispose() {
		this.kokoroInstance = null;
		this._isLoaded = false;
		this.loadPromise = null;
	}
};
|
|
541
|
+
/**
 * Supertonic TTS - Fast on-device text-to-speech
 *
 * Uses transformers.js with the Supertonic-TTS-ONNX model.
 * Generates speech at 167x realtime with 66M parameters.
 * Outputs at 44100 Hz sample rate.
 */
var SupertonicTTS = class {
	// transformers.js text-to-speech pipeline; null until _load() succeeds.
	pipeline = null;
	// Static model metadata resolved in the constructor.
	modelConfig;
	// In-flight load promise; deduplicates concurrent load() calls.
	loadPromise = null;
	_isLoaded = false;
	// "webgpu" or "cpu"; decided during _load().
	_deviceMode = "cpu";
	// Cache of voice ID -> Float32Array speaker embedding, filled lazily in speak().
	voiceEmbeddings = /* @__PURE__ */ new Map();
	/**
	 * @param {string} [modelId="supertonic-66m"] - Key into TTS_MODELS.
	 * @throws {Error} When modelId is not a known TTS model.
	 */
	constructor(modelId = "supertonic-66m") {
		const config = getTTSModelConfig(modelId);
		if (!config) throw new Error(`Unknown TTS model: ${modelId}. Available: ${Object.keys(TTS_MODELS).join(", ")}`);
		this.modelConfig = config;
	}
	/**
	 * Load the TTS model
	 *
	 * Idempotent; concurrent callers share the same in-flight promise.
	 */
	async load(options = {}) {
		if (this._isLoaded) return;
		if (this.loadPromise) return this.loadPromise;
		this.loadPromise = this._load(options);
		await this.loadPromise;
	}
	// Actual loading logic; only ever invoked through load().
	async _load(options = {}) {
		const { onProgress, device = "auto" } = options;
		onProgress?.({ status: `Loading TTS model (${this.modelConfig.id})...` });
		try {
			const isBrowser = typeof window !== "undefined";
			// Lazy chunk import keeps transformers.js out of the main bundle.
			const { pipeline, env } = await import("./transformers.web-u34VxRFM.js");
			// In browsers, fetch the onnxruntime-web WASM binaries from the CDN.
			if (isBrowser && env.backends?.onnx?.wasm) env.backends.onnx.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/";
			// "auto" picks WebGPU only in browsers that expose navigator.gpu.
			if (device === "webgpu" || device === "auto" && isBrowser && "gpu" in navigator) this._deviceMode = "webgpu";
			else this._deviceMode = "cpu";
			onProgress?.({ status: `Loading Supertonic model...` });
			this.pipeline = await pipeline("text-to-speech", this.modelConfig.repo, {
				dtype: "fp32",
				device: this._deviceMode,
				// Relay download progress to the caller's onProgress hook.
				progress_callback: (progress) => {
					if (progress.status === "progress" && progress.file) onProgress?.({
						status: `Downloading ${progress.file}`,
						progress: Math.round(progress.progress || 0),
						file: progress.file
					});
				}
			});
			onProgress?.({ status: "Loading voice embeddings..." });
			await this.loadVoiceEmbeddings();
			this._isLoaded = true;
			onProgress?.({ status: `Ready (${this._deviceMode.toUpperCase()})!` });
		} catch (error) {
			// Reset so a later load() call can retry.
			this.loadPromise = null;
			throw error;
		}
	}
	/**
	 * Load speaker embeddings for all voices
	 * Supertonic uses 101x128 = 12,928 floats per voice
	 *
	 * Intentionally a no-op here: embeddings are fetched lazily per voice
	 * in speak() and cached in this.voiceEmbeddings.
	 */
	async loadVoiceEmbeddings() {}
	// Lazy-load helper mirroring KokoroTTS.ensureLoaded.
	async ensureLoaded(options) {
		if (!this._isLoaded) await this.load(options);
	}
	// Returns a fresh copy of the voice registry.
	listVoices() {
		return [...this.modelConfig.voices];
	}
	// Returns the voice record for an ID, or null when unknown.
	getVoice(voiceId) {
		return this.modelConfig.voices.find((v) => v.id === voiceId) || null;
	}
	// Filters the registry by "male"/"female".
	getVoicesByGender(gender) {
		return this.modelConfig.voices.filter((v) => v.gender === gender);
	}
	/**
	 * Generate speech from text
	 *
	 * Fetches the speaker embedding for the voice on first use (cached
	 * afterwards). NOTE(review): if the embedding download fails, this
	 * silently falls back to a constant placeholder embedding (all 0.1),
	 * which degrades voice quality rather than erroring — confirm this
	 * best-effort behavior is intended. Also note `options.speed` is not
	 * read here and is not forwarded to the pipeline call.
	 */
	async speak(text, options = {}) {
		await this.ensureLoaded({ onProgress: options.onProgress });
		const { voice = this.modelConfig.defaultVoice } = options;
		if (!this.getVoice(voice)) throw new Error(`Unknown voice: ${voice}. Use listVoices() to see available options.`);
		if (!this.pipeline) throw new Error("Model not loaded");
		const startTime = performance.now();
		let speakerEmbedding = this.voiceEmbeddings.get(voice);
		if (!speakerEmbedding) try {
			// Embeddings live alongside the model in the HF repo; note this URL
			// is built from the voice ID, not the voice's embeddingFile field.
			const voiceUrl = `https://huggingface.co/${this.modelConfig.repo}/resolve/main/voices/${voice}.bin`;
			const response = await fetch(voiceUrl);
			if (response.ok) {
				const buffer = await response.arrayBuffer();
				speakerEmbedding = new Float32Array(buffer);
				this.voiceEmbeddings.set(voice, speakerEmbedding);
			} else throw new Error(`Failed to load voice: ${response.status}`);
		} catch {
			// Best-effort fallback: 101x128 = 12,928-float dummy embedding.
			speakerEmbedding = new Float32Array(12928).fill(.1);
			this.voiceEmbeddings.set(voice, speakerEmbedding);
		}
		const result = await this.pipeline(text, { speaker_embeddings: speakerEmbedding });
		const totalTime = performance.now() - startTime;
		const audio = result.audio;
		const sampleRate = result.sampling_rate;
		return {
			audio,
			sampleRate,
			duration: audio.length / sampleRate,
			voice,
			totalTime
		};
	}
	/**
	 * Stream speech generation
	 *
	 * Splits text into sentences and synthesizes each via speak(). The
	 * concatenated full result is the generator's *return* value (discarded
	 * by for-await loops; use manual next() iteration to retrieve it).
	 */
	async *speakStream(text, options = {}) {
		await this.ensureLoaded({ onProgress: options.onProgress });
		const { voice = this.modelConfig.defaultVoice, speed = 1 } = options;
		if (!this.getVoice(voice)) throw new Error(`Unknown voice: ${voice}. Use listVoices() to see available options.`);
		const startTime = performance.now();
		const sentences = text.split(SENTENCE_SPLIT_REGEX).filter((s) => s.trim());
		const allAudio = [];
		let chunkIndex = 0;
		// Fallback rate; overwritten with each chunk's actual sample rate.
		let sampleRate = this.modelConfig.sampleRate;
		for (let i = 0; i < sentences.length; i++) {
			const sentence = sentences[i];
			if (!sentence.trim()) continue;
			// speed is passed along, though speak() currently does not use it.
			const result = await this.speak(sentence, {
				voice,
				speed
			});
			sampleRate = result.sampleRate;
			allAudio.push(result.audio);
			const chunk = {
				samples: result.audio,
				sampleRate: result.sampleRate,
				index: chunkIndex++,
				isFinal: i === sentences.length - 1
			};
			yield chunk;
			// Also notify via callback for consumers not driving the iterator directly.
			options.onAudioChunk?.(chunk);
		}
		// Concatenate chunks into one contiguous buffer for the return value.
		const totalLength = allAudio.reduce((sum, arr) => sum + arr.length, 0);
		const fullAudio = new Float32Array(totalLength);
		let offset = 0;
		for (const chunk of allAudio) {
			fullAudio.set(chunk, offset);
			offset += chunk.length;
		}
		const totalTime = performance.now() - startTime;
		return {
			audio: fullAudio,
			sampleRate,
			duration: fullAudio.length / sampleRate,
			voice,
			totalTime
		};
	}
	// True once load() has completed successfully.
	isLoaded() {
		return this._isLoaded;
	}
	// "webgpu" or "cpu".
	getDeviceMode() {
		return this._deviceMode;
	}
	// Shallow copy of the model configuration.
	getModelInfo() {
		return { ...this.modelConfig };
	}
	// Configured output sample rate (44100 for Supertonic).
	getSampleRate() {
		return this.modelConfig.sampleRate;
	}
	// Drops pipeline and cached embeddings; instance can be reloaded later.
	async dispose() {
		this.pipeline = null;
		this.voiceEmbeddings.clear();
		this._isLoaded = false;
		this.loadPromise = null;
	}
};
|
|
715
|
+
/**
 * Create a TTS instance based on model ID.
 *
 * Model IDs beginning with "supertonic" get a SupertonicTTS engine; every
 * other ID is handed to KokoroTTS, whose constructor validates it against
 * the known model registry.
 *
 * @param {string} [modelId="kokoro-82m"] - TTS model identifier.
 * @returns {KokoroTTS|SupertonicTTS} An unloaded TTS engine instance.
 */
function createTTS(modelId = "kokoro-82m") {
	const wantsSupertonic = modelId.startsWith("supertonic");
	return wantsSupertonic ? new SupertonicTTS(modelId) : new KokoroTTS(modelId);
}
|
|
722
|
+
|
|
723
|
+
//#endregion
|
|
724
|
+
export { createTTS };
|
|
725
|
+
//# sourceMappingURL=tts-C2FzKuSx.js.map
|