@tryhamster/gerbil 1.0.0-rc.1 → 1.0.0-rc.11
This diff shows the contents of publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/README.md +14 -5
- package/dist/browser/{index.d.mts → index.d.ts} +354 -3
- package/dist/browser/index.d.ts.map +1 -0
- package/dist/browser/{index.mjs → index.js} +631 -259
- package/dist/browser/index.js.map +1 -0
- package/dist/{chrome-backend-Y9F7W5VQ.mjs → chrome-backend-CORwaIyC.mjs} +1 -1
- package/dist/{chrome-backend-Y9F7W5VQ.mjs.map → chrome-backend-CORwaIyC.mjs.map} +1 -1
- package/dist/{chrome-backend-JEPeM2YE.mjs → chrome-backend-DIKYoWj-.mjs} +1 -1
- package/dist/cli.mjs +14 -15
- package/dist/cli.mjs.map +1 -1
- package/dist/frameworks/express.d.mts +1 -1
- package/dist/frameworks/express.mjs +3 -4
- package/dist/frameworks/express.mjs.map +1 -1
- package/dist/frameworks/fastify.d.mts +1 -1
- package/dist/frameworks/fastify.mjs +2 -3
- package/dist/frameworks/fastify.mjs.map +1 -1
- package/dist/frameworks/hono.d.mts +1 -1
- package/dist/frameworks/hono.mjs +2 -3
- package/dist/frameworks/hono.mjs.map +1 -1
- package/dist/frameworks/next.d.mts +2 -2
- package/dist/frameworks/next.mjs +2 -3
- package/dist/frameworks/next.mjs.map +1 -1
- package/dist/frameworks/react.d.mts +1 -1
- package/dist/frameworks/trpc.d.mts +1 -1
- package/dist/frameworks/trpc.mjs +2 -3
- package/dist/frameworks/trpc.mjs.map +1 -1
- package/dist/gerbil-DJGqq7BX.mjs +4 -0
- package/dist/{gerbil-yoSpRHgv.mjs → gerbil-DoDGHe6Z.mjs} +187 -19
- package/dist/gerbil-DoDGHe6Z.mjs.map +1 -0
- package/dist/{gerbil-POAz8peb.d.mts → gerbil-qOTe1nl2.d.mts} +2 -2
- package/dist/{gerbil-POAz8peb.d.mts.map → gerbil-qOTe1nl2.d.mts.map} +1 -1
- package/dist/index.d.mts +19 -3
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +6 -7
- package/dist/index.mjs.map +1 -1
- package/dist/integrations/ai-sdk.d.mts +1 -1
- package/dist/integrations/ai-sdk.mjs +4 -5
- package/dist/integrations/ai-sdk.mjs.map +1 -1
- package/dist/integrations/langchain.d.mts +1 -1
- package/dist/integrations/langchain.mjs +2 -3
- package/dist/integrations/langchain.mjs.map +1 -1
- package/dist/integrations/llamaindex.d.mts +1 -1
- package/dist/integrations/llamaindex.mjs +2 -3
- package/dist/integrations/llamaindex.mjs.map +1 -1
- package/dist/integrations/mcp-client.mjs +2 -2
- package/dist/integrations/mcp.d.mts +2 -2
- package/dist/integrations/mcp.mjs +5 -6
- package/dist/kokoro-BNTb6egA.mjs +20210 -0
- package/dist/kokoro-BNTb6egA.mjs.map +1 -0
- package/dist/{mcp-Bitg4sjX.mjs → mcp-kzDDWIoS.mjs} +3 -3
- package/dist/{mcp-Bitg4sjX.mjs.map → mcp-kzDDWIoS.mjs.map} +1 -1
- package/dist/{one-liner-B1rmFto6.mjs → one-liner-DxnNs_JK.mjs} +2 -2
- package/dist/{one-liner-B1rmFto6.mjs.map → one-liner-DxnNs_JK.mjs.map} +1 -1
- package/dist/repl-DGUw4fCc.mjs +9 -0
- package/dist/skills/index.d.mts +2 -2
- package/dist/skills/index.d.mts.map +1 -1
- package/dist/skills/index.mjs +4 -5
- package/dist/{skills-5DxAV-rn.mjs → skills-DulrOPeP.mjs} +12 -12
- package/dist/skills-DulrOPeP.mjs.map +1 -0
- package/dist/stt-1WIefHwc.mjs +3 -0
- package/dist/{stt-Bv_dum-R.mjs → stt-CG_7KB_0.mjs} +3 -2
- package/dist/stt-CG_7KB_0.mjs.map +1 -0
- package/dist/{tools-IYPrqoek.mjs → tools-Bi1P7Xoy.mjs} +2 -2
- package/dist/{tools-IYPrqoek.mjs.map → tools-Bi1P7Xoy.mjs.map} +1 -1
- package/dist/{tts-5yWeP_I0.mjs → tts-B1pZMlDv.mjs} +1 -1
- package/dist/{tts-DG6denWG.mjs → tts-CyHhcLtN.mjs} +6 -4
- package/dist/tts-CyHhcLtN.mjs.map +1 -0
- package/dist/{types-s6Py2_DL.d.mts → types-CiTc7ez3.d.mts} +1 -1
- package/dist/{types-s6Py2_DL.d.mts.map → types-CiTc7ez3.d.mts.map} +1 -1
- package/dist/{utils-CkB4Roi6.mjs → utils-CZBZ8dgR.mjs} +1 -1
- package/dist/{utils-CkB4Roi6.mjs.map → utils-CZBZ8dgR.mjs.map} +1 -1
- package/docs/architecture/overview.md +15 -7
- package/docs/tts.md +11 -8
- package/package.json +6 -6
- package/dist/browser/index.d.mts.map +0 -1
- package/dist/browser/index.mjs.map +0 -1
- package/dist/gerbil-DeQlX_Mt.mjs +0 -5
- package/dist/gerbil-yoSpRHgv.mjs.map +0 -1
- package/dist/models-BAtL8qsA.mjs +0 -171
- package/dist/models-BAtL8qsA.mjs.map +0 -1
- package/dist/models-CE0fBq0U.d.mts +0 -22
- package/dist/models-CE0fBq0U.d.mts.map +0 -1
- package/dist/repl-D20JO260.mjs +0 -10
- package/dist/skills-5DxAV-rn.mjs.map +0 -1
- package/dist/stt-Bv_dum-R.mjs.map +0 -1
- package/dist/stt-KzSoNvwI.mjs +0 -3
- package/dist/tts-DG6denWG.mjs.map +0 -1
- /package/dist/{auto-update-DsWBBnEk.mjs → auto-update-S9s5-g0C.mjs} +0 -0
- /package/dist/{chunk-Ct1HF2bE.mjs → chunk-CkXuGtQK.mjs} +0 -0
- /package/dist/{microphone-D-6y9aiE.mjs → microphone-DaMZFRuR.mjs} +0 -0
package/dist/browser/{index.mjs → index.js}

@@ -1,5 +1,115 @@
- …
+//#region src/core/models.ts
+const BUILTIN_MODELS = {
+  "qwen3-0.6b": {
+    id: "qwen3-0.6b",
+    repo: "onnx-community/Qwen3-0.6B-ONNX",
+    description: "Qwen3 0.6B - Best balance of speed and quality, supports thinking",
+    size: "~400MB",
+    contextLength: 32768,
+    supportsThinking: true,
+    supportsJson: true,
+    family: "qwen"
+  },
+  "qwen2.5-0.5b": {
+    id: "qwen2.5-0.5b",
+    repo: "onnx-community/Qwen2.5-0.5B-Instruct",
+    description: "Qwen2.5 0.5B - Fast and capable",
+    size: "~350MB",
+    contextLength: 32768,
+    supportsThinking: false,
+    supportsJson: true,
+    family: "qwen"
+  },
+  "qwen2.5-coder-0.5b": {
+    id: "qwen2.5-coder-0.5b",
+    repo: "onnx-community/Qwen2.5-Coder-0.5B-Instruct",
+    description: "Qwen2.5 Coder 0.5B - Optimized for code",
+    size: "~400MB",
+    contextLength: 32768,
+    supportsThinking: false,
+    supportsJson: true,
+    family: "qwen"
+  },
+  "smollm2-360m": {
+    id: "smollm2-360m",
+    repo: "HuggingFaceTB/SmolLM2-360M-Instruct",
+    description: "SmolLM2 360M - Fast, good for simple tasks",
+    size: "~250MB",
+    contextLength: 8192,
+    supportsThinking: false,
+    supportsJson: false,
+    family: "smollm"
+  },
+  "smollm2-135m": {
+    id: "smollm2-135m",
+    repo: "HuggingFaceTB/SmolLM2-135M-Instruct",
+    description: "SmolLM2 135M - Fastest, basic generation",
+    size: "~100MB",
+    contextLength: 8192,
+    supportsThinking: false,
+    supportsJson: false,
+    family: "smollm"
+  },
+  "phi-3-mini": {
+    id: "phi-3-mini",
+    repo: "microsoft/Phi-3-mini-4k-instruct-onnx",
+    description: "Phi-3 Mini - High quality, larger model",
+    size: "~2.1GB",
+    contextLength: 4096,
+    supportsThinking: false,
+    supportsJson: true,
+    family: "phi"
+  },
+  "ministral-3b": {
+    id: "ministral-3b",
+    repo: "mistralai/Ministral-3-3B-Instruct-2512-ONNX",
+    description: "Ministral 3 3B - Vision + Reasoning, 256k context",
+    size: "~2.5GB",
+    contextLength: 262144,
+    supportsThinking: true,
+    supportsJson: true,
+    supportsVision: true,
+    visionEncoderSize: "0.4B",
+    family: "mistral"
+  }
+};
+/**
+ * Parse model identifier and resolve to source
+ *
+ * Supported formats:
+ * - "qwen3-0.6b" (built-in)
+ * - "hf:org/model" (HuggingFace shorthand)
+ * - "https://huggingface.co/org/model" (full URL)
+ * - "file:./path/to/model" (local path)
+ */
+function resolveModel(modelId) {
+  if (BUILTIN_MODELS[modelId]) return {
+    type: "builtin",
+    path: BUILTIN_MODELS[modelId].repo
+  };
+  if (modelId.startsWith("hf:")) return {
+    type: "huggingface",
+    path: modelId.slice(3)
+  };
+  if (modelId.startsWith("https://huggingface.co/")) return {
+    type: "huggingface",
+    path: modelId.replace("https://huggingface.co/", "")
+  };
+  if (modelId.startsWith("file:")) return {
+    type: "local",
+    path: modelId.slice(5)
+  };
+  if (modelId.includes("/")) return {
+    type: "huggingface",
+    path: modelId
+  };
+  return {
+    type: "huggingface",
+    path: modelId
+  };
+}
 
+//#endregion
 //#region src/browser/index.ts
 /**
  * Gerbil Browser Support
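The new `resolveModel` helper dispatches on the identifier's prefix. A minimal sketch of what each supported format resolves to, using the function from the hunk above (the `myorg/my-model` names are illustrative, not real repos):

```js
// Assuming the resolveModel from the hunk above is in scope:
resolveModel("qwen3-0.6b");
// → { type: "builtin", path: "onnx-community/Qwen3-0.6B-ONNX" }
resolveModel("hf:myorg/my-model");
// → { type: "huggingface", path: "myorg/my-model" }
resolveModel("https://huggingface.co/myorg/my-model");
// → { type: "huggingface", path: "myorg/my-model" }
resolveModel("file:./models/custom");
// → { type: "local", path: "./models/custom" }
resolveModel("bare-model-name");
// → { type: "huggingface", path: "bare-model-name" } (fallback)
```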
@@ -1180,6 +1290,126 @@ const TTS_MODELS = {
     voices: SUPERTONIC_BROWSER_VOICES
   }
 };
+const TTS_WORKER_CODE = `
+// TTS Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let ttsInstance = null;
+let modelType = null; // "supertonic" or "kokoro"
+let voiceEmbeddings = new Map();
+let kokoroTTS = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { modelId, repo, voices } = payload;
+      modelType = modelId === "supertonic-66m" ? "supertonic" : "kokoro";
+
+      if (modelType === "supertonic") {
+        // Load Supertonic using transformers.js pipeline
+        ttsInstance = await pipeline("text-to-speech", repo, {
+          device: "webgpu",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+
+        // Load voice embeddings
+        for (const voice of voices) {
+          try {
+            const voiceUrl = "https://huggingface.co/" + repo + "/resolve/main/voices/" + voice.id + ".bin";
+            const response = await fetch(voiceUrl);
+            if (response.ok) {
+              const buffer = await response.arrayBuffer();
+              voiceEmbeddings.set(voice.id, new Float32Array(buffer));
+            }
+          } catch (err) {
+            console.warn("Failed to load voice:", voice.id, err);
+          }
+        }
+
+        // Warmup
+        try {
+          await ttsInstance("Hello", {
+            speaker_embeddings: new Float32Array(1 * 101 * 128),
+            num_inference_steps: 1,
+            speed: 1.0,
+          });
+        } catch (e) {
+          console.warn("Warmup failed:", e);
+        }
+      } else {
+        // Load Kokoro using kokoro-js from CDN
+        const kokoroModule = await import("https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.min.js");
+        const { KokoroTTS } = kokoroModule;
+
+        kokoroTTS = await KokoroTTS.from_pretrained(repo, {
+          dtype: "fp32",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+      }
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "generate") {
+    try {
+      const { text, voice, speed } = payload;
+      let audio, sampleRate;
+
+      if (modelType === "supertonic") {
+        let embedding = voiceEmbeddings.get(voice);
+        if (!embedding) {
+          embedding = new Float32Array(101 * 128).fill(0.1);
+        }
+
+        const result = await ttsInstance(text, {
+          speaker_embeddings: embedding,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      } else {
+        const result = await kokoroTTS.generate(text, {
+          voice: voice,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      }
+
+      // Transfer audio data back
+      self.postMessage(
+        { type: "audio", payload: { audio: audio, sampleRate: sampleRate } },
+        [audio.buffer]
+      );
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create TTS worker instance */
+function createTTSWorker() {
+  const blob = new Blob([TTS_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for text-to-speech with Web Audio API playback
  *
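`createTTSWorker` boots the worker from an inline source string instead of a separate file, which keeps the published bundle self-contained. A stripped-down sketch of the same pattern (the echo worker body is illustrative):

```js
// Sketch of the inline-worker pattern used by createTTSWorker above:
// serialize the worker body to a string, wrap it in a Blob, and start a
// module worker from an object URL.
const workerSource = `
  self.onmessage = (e) => {
    self.postMessage({ echo: e.data });
  };
`;
const blob = new Blob([workerSource], { type: "application/javascript" });
const url = URL.createObjectURL(blob);
const worker = new Worker(url, { type: "module" });
URL.revokeObjectURL(url); // safe: the Worker constructor already consumed the URL
worker.onmessage = (e) => console.log(e.data); // { echo: "ping" }
worker.postMessage("ping");
```

One consequence visible in the worker source above: a blob-URL module worker has no base URL to resolve bare import specifiers against, which is presumably why the worker imports transformers.js and kokoro-js from absolute CDN URLs rather than from the package's own dependencies.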
@@ -1229,17 +1459,17 @@ function useSpeech(options = {}) {
   const [shouldLoad, setShouldLoad] = useState(autoLoad);
   const [currentVoice, setCurrentVoice] = useState(defaultVoice);
   const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
-  const …
-  const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
+  const workerRef = useRef(null);
   const audioContextRef = useRef(null);
   const sourceNodeRef = useRef(null);
   const mountedRef = useRef(true);
   const modelIdRef = useRef(modelId);
+  const pendingSpeakRef = useRef(null);
   const listVoices = useCallback(() => {
     return modelConfig.voices;
   }, [modelConfig.voices]);
   const load = useCallback(() => {
-    if (!…
+    if (workerRef.current || isLoading) return;
     setIsLoading(true);
     setShouldLoad(true);
   }, [isLoading]);
@@ -1247,86 +1477,48 @@ function useSpeech(options = {}) {
   if (!shouldLoad) return;
   mountedRef.current = true;
   modelIdRef.current = modelId;
-  const …
-  …
-          file: progress.file,
-          progress: Math.round(progress.progress || 0)
-        });
-      }
-    });
-    if (!mountedRef.current) return;
-    const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
-    const embeddingsMap = /* @__PURE__ */ new Map();
-    await Promise.all(config.voices.map(async (voice) => {
-      try {
-        const response = await fetch(`${voicesUrl}${voice.id}.bin`);
-        if (response.ok) {
-          const buffer = await response.arrayBuffer();
-          embeddingsMap.set(voice.id, new Float32Array(buffer));
-        }
-      } catch (e) {
-        console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
-      }
-    }));
-    if (!mountedRef.current) return;
-    try {
-      await tts("Hello", {
-        speaker_embeddings: new Float32Array(12928),
-        num_inference_steps: 1,
-        speed: 1
-      });
-    } catch (e) {
-      console.warn("Supertonic warmup failed:", e);
-    }
-    voiceEmbeddingsRef.current = embeddingsMap;
-    ttsRef.current = {
-      type: "supertonic",
-      pipeline: tts,
-      config
-    };
-  } else {
-    const { KokoroTTS } = await import("kokoro-js");
-    const tts = await KokoroTTS.from_pretrained(config.repo, {
-      dtype: "fp32",
-      progress_callback: (progress) => {
-        if (!mountedRef.current) return;
-        if (progress.status === "progress" && progress.file) setLoadingProgress({
-          status: "downloading",
-          file: progress.file,
-          progress: Math.round(progress.progress || 0)
-        });
-      }
-    });
-    if (!mountedRef.current) return;
-    ttsRef.current = {
-      type: "kokoro",
-      instance: tts,
-      config
-    };
-  }
+  const config = TTS_MODELS[modelId];
+  setLoadingProgress({
+    status: "loading",
+    message: `Loading ${modelId === "supertonic-66m" ? "Supertonic" : "Kokoro"} TTS...`
+  });
+  const worker = createTTSWorker();
+  workerRef.current = worker;
+  worker.onmessage = (e) => {
+    if (!mountedRef.current) return;
+    const { type, payload } = e.data;
+    if (type === "progress" && payload.status === "progress" && payload.file) setLoadingProgress({
+      status: "downloading",
+      file: payload.file,
+      progress: Math.round(payload.progress || 0)
+    });
+    if (type === "ready") {
       setIsLoading(false);
       setIsReady(true);
       setLoadingProgress({ status: "ready" });
       onReady?.();
-      …
+      if (pendingSpeakRef.current) {
+        const { text, voice, speed } = pendingSpeakRef.current;
+        pendingSpeakRef.current = null;
+        worker.postMessage({
+          type: "generate",
+          payload: {
+            text,
+            voice,
+            speed
+          }
+        });
+      }
+    }
+    if (type === "audio") {
+      const { audio, sampleRate } = payload;
+      playAudioData(audio, sampleRate);
+    }
+    if (type === "error") {
+      const errorMsg = payload;
       setError(errorMsg);
       setIsLoading(false);
+      setIsSpeaking(false);
       setLoadingProgress({
         status: "error",
         error: errorMsg
@@ -1334,9 +1526,29 @@ function useSpeech(options = {}) {
       onError?.(errorMsg);
     }
   };
-  …
+  worker.onerror = (err) => {
+    if (!mountedRef.current) return;
+    const errorMsg = err.message || "Worker error";
+    setError(errorMsg);
+    setIsLoading(false);
+    setLoadingProgress({
+      status: "error",
+      error: errorMsg
+    });
+    onError?.(errorMsg);
+  };
+  worker.postMessage({
+    type: "load",
+    payload: {
+      modelId,
+      repo: config.repo,
+      voices: config.voices
+    }
+  });
   return () => {
     mountedRef.current = false;
+    worker.terminate();
+    workerRef.current = null;
   };
 }, [
   shouldLoad,
@@ -1344,6 +1556,30 @@ function useSpeech(options = {}) {
   onReady,
   onError
 ]);
+const playAudioData = useCallback(async (audio, sampleRate) => {
+  try {
+    if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext({ sampleRate });
+    const ctx = audioContextRef.current;
+    if (ctx.state === "suspended") await ctx.resume();
+    const audioBuffer = ctx.createBuffer(1, audio.length, sampleRate);
+    audioBuffer.copyToChannel(new Float32Array(audio), 0);
+    const sourceNode = ctx.createBufferSource();
+    sourceNode.buffer = audioBuffer;
+    sourceNode.connect(ctx.destination);
+    sourceNodeRef.current = sourceNode;
+    sourceNode.onended = () => {
+      if (!mountedRef.current) return;
+      setIsSpeaking(false);
+      onEnd?.();
+    };
+    sourceNode.start();
+  } catch (err) {
+    setIsSpeaking(false);
+    const errorMsg = err instanceof Error ? err.message : String(err);
+    setError(errorMsg);
+    onError?.(errorMsg);
+  }
+}, [onEnd, onError]);
 useEffect(() => {
   return () => {
     try {
@@ -1358,89 +1594,46 @@ function useSpeech(options = {}) {
 speak: useCallback(async (text, opts) => {
   const voice = opts?.voice || currentVoice;
   const speed = opts?.speed || currentSpeed;
-  if (!…
+  if (!modelConfig.voices.find((v) => v.id === voice)) {
+    const errorMsg = `Voice "${voice}" not found. Should be one of: ${modelConfig.voices.map((v) => v.id).join(", ")}.`;
+    setError(errorMsg);
+    onError?.(errorMsg);
+    return;
+  }
+  if (!workerRef.current) {
+    pendingSpeakRef.current = {
+      text,
+      voice,
+      speed
+    };
     load();
     return;
   }
-  …
-  const ttsBackend = ttsRef.current;
-  if (ttsBackend.type === "supertonic") {
-    const config = ttsBackend.config;
-    if (!config.voices.find((v) => v.id === voice)) {
-      const validVoices = config.voices.map((v) => v.id).join(", ");
-      throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-    }
-    let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
-    if (!speakerEmbedding) try {
-      const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
-      const response = await fetch(voiceUrl);
-      if (response.ok) {
-        const buffer = await response.arrayBuffer();
-        speakerEmbedding = new Float32Array(buffer);
-        voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-      } else throw new Error(`Failed to load voice: ${response.status}`);
-    } catch {
-      speakerEmbedding = new Float32Array(12928).fill(.1);
-      voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-    }
-    const result = await ttsBackend.pipeline(text, {
-      speaker_embeddings: speakerEmbedding,
-      speed
-    });
-    audioData = result.audio;
-    sampleRate = result.sampling_rate;
-  } else {
-    const config = ttsBackend.config;
-    if (!config.voices.find((v) => v.id === voice)) {
-      const validVoices = config.voices.map((v) => v.id).join(", ");
-      throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-    }
-    const result = await ttsBackend.instance.generate(text, {
-      voice,
-      speed
-    });
-    audioData = result.audio;
-    sampleRate = result.sampling_rate;
-  }
-  if (!mountedRef.current) return;
-  if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
-  const audioContext = audioContextRef.current;
-  if (audioContext.state === "suspended") await audioContext.resume();
-  const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
-  const channelData = new Float32Array(audioData);
-  audioBuffer.copyToChannel(channelData, 0);
-  if (sourceNodeRef.current) {
-    sourceNodeRef.current.stop();
-    sourceNodeRef.current.disconnect();
-  }
-  const sourceNode = audioContext.createBufferSource();
-  sourceNode.buffer = audioBuffer;
-  sourceNode.connect(audioContext.destination);
-  sourceNode.onended = () => {
-    if (mountedRef.current) {
-      setIsSpeaking(false);
-      onEnd?.();
-    }
+  if (!isReady) {
+    pendingSpeakRef.current = {
+      text,
+      voice,
+      speed
   };
-  …
-  sourceNode.start();
-  } catch (err) {
-    if (!mountedRef.current) return;
-    const errorMsg = err instanceof Error ? err.message : String(err);
-    setError(errorMsg);
-    setIsSpeaking(false);
-    onError?.(errorMsg);
+    return;
   }
+  setIsSpeaking(true);
+  onStart?.();
+  workerRef.current.postMessage({
+    type: "generate",
+    payload: {
+      text,
+      voice,
+      speed
+    }
+  });
 }, [
   currentVoice,
   currentSpeed,
+  modelConfig.voices,
   load,
+  isReady,
   onStart,
-  onEnd,
   onError
 ]),
 stop: useCallback(() => {
@@ -1563,6 +1756,61 @@ function createAudioPlayer(sampleRate = 24e3) {
   isPlaying: () => isActive
 };
 }
+const STT_WORKER_CODE = `
+// STT Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let sttPipeline = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { model } = payload;
+
+      // Load Whisper model
+      sttPipeline = await pipeline("automatic-speech-recognition", model, {
+        device: "webgpu",
+        progress_callback: (progress) => {
+          self.postMessage({ type: "progress", payload: progress });
+        },
+      });
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "transcribe") {
+    try {
+      const { audio } = payload;
+
+      // Run transcription
+      const result = await sttPipeline(audio, {
+        return_timestamps: false,
+      });
+
+      self.postMessage({ type: "transcript", payload: result.text || "" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create STT worker instance */
+function createSTTWorker() {
+  const blob = new Blob([STT_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for voice input with browser microphone
  *
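Both workers hand audio across the thread boundary as transferables rather than clones (`postMessage(msg, [audio.buffer])` above). A minimal sketch of the difference, assuming a `worker` created with `createSTTWorker`:

```js
// Sketch: structured clone vs. transfer for a Float32Array.
const samples = new Float32Array(16000); // one second at 16 kHz

// Clone (default): both sides keep a copy; memory is duplicated.
worker.postMessage({ type: "transcribe", payload: { audio: samples } });

// Transfer: the underlying ArrayBuffer moves to the worker and the
// sender's view is detached (owned.length reads 0 afterwards). This is
// what the hunks do in both directions for audio data.
const owned = new Float32Array(samples); // copy first if still needed here
worker.postMessage(
  { type: "transcribe", payload: { audio: owned } },
  [owned.buffer]
);
```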
@@ -1620,7 +1868,7 @@ function useVoiceInput(options = {}) {
 const [chunkCount, setChunkCount] = useState(0);
 const [error, setError] = useState(null);
 const [shouldLoad, setShouldLoad] = useState(autoLoad);
-const …
+const workerRef = useRef(null);
 const mediaRecorderRef = useRef(null);
 const audioChunksRef = useRef([]);
 const streamRef = useRef(null);
@@ -1628,49 +1876,66 @@ function useVoiceInput(options = {}) {
 const streamingIntervalRef = useRef(null);
 const pendingChunksRef = useRef([]);
 const fullTranscriptRef = useRef("");
+const transcribeResolveRef = useRef(null);
+const transcribeRejectRef = useRef(null);
+const resolveSTTModel = (modelId) => {
+  return {
+    "whisper-tiny": "onnx-community/whisper-tiny",
+    "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+    "whisper-base": "onnx-community/whisper-base",
+    "whisper-base.en": "onnx-community/whisper-base.en",
+    "whisper-small": "onnx-community/whisper-small",
+    "whisper-small.en": "onnx-community/whisper-small.en"
+  }[modelId] || modelId;
+};
 useEffect(() => {
   if (!shouldLoad || isReady) return;
-  …
-  if (cancelled || !mountedRef.current) {
-    stt.dispose();
-    return;
-  }
-  sttRef.current = stt;
+  mountedRef.current = true;
+  setIsLoading(true);
+  setLoadingProgress({
+    status: "loading",
+    message: "Loading STT model..."
+  });
+  onProgress?.({
+    status: "loading",
+    message: "Loading STT model..."
+  });
+  const worker = createSTTWorker();
+  workerRef.current = worker;
+  worker.onmessage = (e) => {
+    if (!mountedRef.current) return;
+    const { type, payload } = e.data;
+    if (type === "progress") {
+      const progress = {
+        status: payload.progress !== void 0 ? "downloading" : "loading",
+        message: payload.status,
+        progress: payload.progress,
+        file: payload.file
+      };
+      setLoadingProgress(progress);
+      onProgress?.(progress);
+    }
+    if (type === "ready") {
       setIsReady(true);
       setIsLoading(false);
       setLoadingProgress({ status: "ready" });
       onProgress?.({ status: "ready" });
       onReady?.();
-    }
-    …
-    const …
+    }
+    if (type === "transcript") {
+      const text = payload;
+      setIsTranscribing(false);
+      if (transcribeResolveRef.current) {
+        transcribeResolveRef.current(text);
+        transcribeResolveRef.current = null;
+        transcribeRejectRef.current = null;
+      }
+    }
+    if (type === "error") {
+      const errMsg = payload;
       setError(errMsg);
       setIsLoading(false);
+      setIsTranscribing(false);
       setLoadingProgress({
         status: "error",
         message: errMsg
@@ -1680,11 +1945,36 @@ function useVoiceInput(options = {}) {
       message: errMsg
     });
     onError?.(errMsg);
+    if (transcribeRejectRef.current) {
+      transcribeRejectRef.current(new Error(errMsg));
+      transcribeResolveRef.current = null;
+      transcribeRejectRef.current = null;
+    }
   }
 };
-…
+worker.onerror = (err) => {
+  if (!mountedRef.current) return;
+  const errMsg = err.message || "Worker error";
+  setError(errMsg);
+  setIsLoading(false);
+  setLoadingProgress({
+    status: "error",
+    message: errMsg
+  });
+  onProgress?.({
+    status: "error",
+    message: errMsg
+  });
+  onError?.(errMsg);
+};
+worker.postMessage({
+  type: "load",
+  payload: { model: resolveSTTModel(model) }
+});
 return () => {
-  …
+  mountedRef.current = false;
+  worker.terminate();
+  workerRef.current = null;
 };
 }, [
 shouldLoad,
@@ -1698,7 +1988,10 @@ function useVoiceInput(options = {}) {
 mountedRef.current = true;
 return () => {
   mountedRef.current = false;
-  if (…
+  if (workerRef.current) {
+    workerRef.current.terminate();
+    workerRef.current = null;
+  }
   if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
 };
 }, []);
@@ -1732,27 +2025,38 @@ function useVoiceInput(options = {}) {
   return new Float32Array(channelData);
 }, []);
 const transcribe = useCallback(async (audio) => {
-  if (!…
+  if (!workerRef.current) {
     if (!shouldLoad) {
       setShouldLoad(true);
       throw new Error("STT model not loaded. Loading now, please try again.");
     }
     throw new Error("STT model not loaded");
   }
+  if (!isReady) throw new Error("STT model still loading");
   setIsTranscribing(true);
-  …
+  return new Promise((resolve, reject) => {
+    transcribeResolveRef.current = (text) => {
+      let filtered = text.trim();
+      if (filtered === "[BLANK_AUDIO]" || filtered === "(blank audio)" || filtered === "[BLANK AUDIO]") filtered = "";
+      setTranscript(filtered);
+      onTranscript?.(filtered);
+      resolve(filtered);
+    };
+    transcribeRejectRef.current = reject;
+    const audioArray = new Float32Array(audio);
+    workerRef.current.postMessage({
+      type: "transcribe",
+      payload: { audio: audioArray }
+    }, [audioArray.buffer]);
+  });
+}, [
+  shouldLoad,
+  isReady,
+  onTranscript
+]);
 const processedSamplesRef = useRef(0);
 const transcribeChunk = useCallback(async (chunkIdx) => {
-  if (!…
+  if (!workerRef.current || !isReady || audioChunksRef.current.length === 0) return "";
   try {
     const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
     const newSamplesStart = processedSamplesRef.current;
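`transcribe` bridges the worker's event-style replies back into a Promise by stashing `resolve`/`reject` in refs that the `onmessage` handler calls later. The pattern in isolation (a sketch with `worker` assumed in scope; a production version would also want per-request ids so concurrent calls cannot claim each other's replies):

```js
// Sketch: promisifying a one-outstanding-request worker round trip.
let pendingResolve = null;
let pendingReject = null;

worker.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "transcript" && pendingResolve) {
    pendingResolve(payload);              // settle the waiting caller
    pendingResolve = pendingReject = null;
  }
  if (type === "error" && pendingReject) {
    pendingReject(new Error(payload));
    pendingResolve = pendingReject = null;
  }
};

function transcribe(audio) {
  return new Promise((resolve, reject) => {
    pendingResolve = resolve;
    pendingReject = reject;
    const audioArray = new Float32Array(audio); // copy, then transfer
    worker.postMessage(
      { type: "transcribe", payload: { audio: audioArray } },
      [audioArray.buffer]
    );
  });
}
```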
@@ -1760,8 +2064,7 @@ function useVoiceInput(options = {}) {
 if (totalSamples - newSamplesStart < 8e3) return "";
 const newAudio = audioData.slice(newSamplesStart);
 processedSamplesRef.current = totalSamples;
-…
-if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
+const text = await transcribe(newAudio);
 if (text && mountedRef.current) {
   setStreamingChunk(text);
   onChunk?.(text, chunkIdx);
@@ -1770,38 +2073,30 @@ function useVoiceInput(options = {}) {
 } catch {
   return "";
 }
-}, [ …
+}, [
+  blobToFloat32,
+  isReady,
+  transcribe,
+  onChunk
+]);
 return {
   startRecording: useCallback(async () => {
     if (isRecording) return;
     try {
-      if (streaming && !…
+      if (streaming && !isReady) {
         if (!shouldLoad) setShouldLoad(true);
-        …
-            onProgress?.(progress);
-          }
-        } });
-        if (!mountedRef.current) {
-          stt.dispose();
-          return;
-        }
-        sttRef.current = stt;
-        setIsReady(true);
-        setIsLoading(false);
-        setLoadingProgress({ status: "ready" });
-        onProgress?.({ status: "ready" });
-        onReady?.();
+        await new Promise((resolve, reject) => {
+          const checkReady = setInterval(() => {
+            if (isReady && workerRef.current) {
+              clearInterval(checkReady);
+              resolve();
+            }
+          }, 100);
+          setTimeout(() => {
+            clearInterval(checkReady);
+            reject(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
+          }, 6e4);
+        });
       }
       const stream = await navigator.mediaDevices.getUserMedia({ audio: {
         sampleRate: 16e3,
@@ -1828,7 +2123,7 @@ function useVoiceInput(options = {}) {
 mediaRecorder.start(100);
 setIsRecording(true);
 setError(null);
-if (streaming && …
+if (streaming && isReady && workerRef.current) {
   let chunkIdx = 0;
   let shouldContinue = true;
   const processNextChunk = async () => {
@@ -1918,11 +2213,11 @@ function useVoiceInput(options = {}) {
 }
 const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
 try {
-  if (!…
+  if (!isReady || !workerRef.current) {
     if (!shouldLoad) setShouldLoad(true);
     await new Promise((res, rej) => {
       const checkReady = setInterval(() => {
-        if (…
+        if (isReady && workerRef.current) {
           clearInterval(checkReady);
           res();
         }
@@ -2053,6 +2348,16 @@ function useVoiceChat(options = {}) {
 const isListening = stage === "listening";
 const isProcessing = stage === "transcribing" || stage === "thinking";
 const isSpeaking = stage === "speaking";
+const resolveSTTModel = (modelId) => {
+  return {
+    "whisper-tiny": "onnx-community/whisper-tiny",
+    "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+    "whisper-base": "onnx-community/whisper-base",
+    "whisper-base.en": "onnx-community/whisper-base.en",
+    "whisper-small": "onnx-community/whisper-small",
+    "whisper-small.en": "onnx-community/whisper-small.en"
+  }[modelId] || modelId;
+};
 useEffect(() => {
   if (!shouldLoad || isReady) return;
   let cancelled = false;
@@ -2061,18 +2366,29 @@ function useVoiceChat(options = {}) {
 setIsLoading(true);
 setError(null);
 setLoadingMessage("Loading speech recognition (Whisper)...");
-const …
-if (cancelled || !mountedRef.current) return;
-const stt = new WhisperSTT(sttModel);
-await stt.load({ onProgress: (p) => {
-  if (!mountedRef.current) return;
-  setLoadingMessage(p.status || "Loading STT...");
-} });
+const sttWorker = createSTTWorker();
 if (cancelled || !mountedRef.current) {
-  …
+  sttWorker.terminate();
   return;
 }
-…
+await new Promise((resolve, reject) => {
+  sttWorker.onmessage = (e) => {
+    const { type, payload } = e.data;
+    if (type === "ready") resolve();
+    if (type === "error") reject(new Error(payload));
+    if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading STT...");
+  };
+  sttWorker.onerror = (e) => reject(new Error(e.message));
+  sttWorker.postMessage({
+    type: "load",
+    payload: { model: resolveSTTModel(sttModel) }
+  });
+});
+if (cancelled || !mountedRef.current) {
+  sttWorker.terminate();
+  return;
+}
+sttRef.current = sttWorker;
 setLoadingMessage("Loading language model...");
 const worker = await createGerbilWorker({
   modelId: llmModel,
@@ -2087,18 +2403,34 @@ function useVoiceChat(options = {}) {
 }
 llmWorkerRef.current = worker;
 setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
-const …
-if (cancelled || !mountedRef.current) …
-…
+const ttsWorker = createTTSWorker();
+if (cancelled || !mountedRef.current) {
+  ttsWorker.terminate();
+  return;
+}
+const ttsConfig$1 = TTS_MODELS[ttsModelId];
+await new Promise((resolve, reject) => {
+  ttsWorker.onmessage = (e) => {
+    const { type, payload } = e.data;
+    if (type === "ready") resolve();
+    if (type === "error") reject(new Error(payload));
+    if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading TTS...");
+  };
+  ttsWorker.onerror = (e) => reject(new Error(e.message));
+  ttsWorker.postMessage({
+    type: "load",
+    payload: {
+      modelId: ttsModelId,
+      repo: ttsConfig$1.repo,
+      voices: ttsConfig$1.voices
+    }
+  });
+});
 if (cancelled || !mountedRef.current) {
-  …
+  ttsWorker.terminate();
   return;
 }
-ttsRef.current = …
+ttsRef.current = ttsWorker;
 setIsReady(true);
 setIsLoading(false);
 setLoadingMessage("Ready!");
@@ -2127,8 +2459,8 @@ function useVoiceChat(options = {}) {
 return () => {
   mountedRef.current = false;
   llmWorkerRef.current?.terminate();
-  sttRef.current?.…
-  ttsRef.current?.…
+  sttRef.current?.terminate();
+  ttsRef.current?.terminate();
   if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
   audioContextRef.current?.close();
 };
@@ -2234,7 +2566,26 @@ function useVoiceChat(options = {}) {
 try {
   setStage("transcribing");
   const audioData = await blobToFloat32(audioBlob);
-  let userText = …
+  let userText = await new Promise((sttResolve, sttReject) => {
+    const handler = (e) => {
+      const { type, payload } = e.data;
+      if (type === "transcript") {
+        sttRef.current?.removeEventListener("message", handler);
+        sttResolve(payload);
+      }
+      if (type === "error") {
+        sttRef.current?.removeEventListener("message", handler);
+        sttReject(new Error(payload));
+      }
+    };
+    sttRef.current?.addEventListener("message", handler);
+    const audioArray = new Float32Array(audioData);
+    sttRef.current?.postMessage({
+      type: "transcribe",
+      payload: { audio: audioArray }
+    }, [audioArray.buffer]);
+  });
+  userText = userText.trim();
   if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
   if (cancelledRef.current || !userText) {
     setStage("idle");
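`useVoiceChat` takes a different route than `useVoiceInput` for the same round trip: instead of parking resolve/reject in refs, it attaches a throwaway `addEventListener` handler per request and removes it on the first terminal message. `addEventListener` composes with an existing `onmessage` assignment, so the per-request handler does not clobber any long-lived listener. A sketch of the helper this reduces to (the name is illustrative):

```js
// Sketch: one-shot request/reply against a worker, the shape used for
// both the "transcribe" and "generate" round trips above.
function requestOnce(worker, message, transfer, doneType) {
  return new Promise((resolve, reject) => {
    const handler = (e) => {
      const { type, payload } = e.data;
      if (type === doneType) {
        worker.removeEventListener("message", handler);
        resolve(payload);
      }
      if (type === "error") {
        worker.removeEventListener("message", handler);
        reject(new Error(payload));
      }
    };
    worker.addEventListener("message", handler);
    worker.postMessage(message, transfer);
  });
}

// e.g. const text = await requestOnce(sttWorker,
//   { type: "transcribe", payload: { audio } }, [audio.buffer], "transcript");
```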
@@ -2284,9 +2635,30 @@ function useVoiceChat(options = {}) {
 onAssistantSpeak?.(responseText);
 if (responseText.trim()) {
   setStage("speaking");
-  const ttsResult = await …
-  …
+  const ttsResult = await new Promise((ttsResolve, ttsReject) => {
+    const handler = (e) => {
+      const { type, payload } = e.data;
+      if (type === "audio") {
+        ttsRef.current?.removeEventListener("message", handler);
+        ttsResolve({
+          audio: payload.audio,
+          sampleRate: payload.sampleRate
+        });
+      }
+      if (type === "error") {
+        ttsRef.current?.removeEventListener("message", handler);
+        ttsReject(new Error(payload));
+      }
+    };
+    ttsRef.current?.addEventListener("message", handler);
+    ttsRef.current?.postMessage({
+      type: "generate",
+      payload: {
+        text: responseText,
+        voice,
+        speed
+      }
+    });
   });
   if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
 }
@@ -2378,4 +2750,4 @@ var browser_default = {
 
 //#endregion
 export { BUILTIN_MODELS, createAudioPlayer, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, playAudio, useChat, useCompletion, useSpeech, useVoiceChat, useVoiceInput };
-//# sourceMappingURL=index.mjs.map
+//# sourceMappingURL=index.js.map