@tryhamster/gerbil 1.0.0-rc.1 → 1.0.0-rc.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +14 -5
  2. package/dist/browser/{index.d.mts → index.d.ts} +354 -3
  3. package/dist/browser/index.d.ts.map +1 -0
  4. package/dist/browser/{index.mjs → index.js} +631 -259
  5. package/dist/browser/index.js.map +1 -0
  6. package/dist/{chrome-backend-Y9F7W5VQ.mjs → chrome-backend-CORwaIyC.mjs} +1 -1
  7. package/dist/{chrome-backend-Y9F7W5VQ.mjs.map → chrome-backend-CORwaIyC.mjs.map} +1 -1
  8. package/dist/{chrome-backend-JEPeM2YE.mjs → chrome-backend-DIKYoWj-.mjs} +1 -1
  9. package/dist/cli.mjs +14 -15
  10. package/dist/cli.mjs.map +1 -1
  11. package/dist/frameworks/express.d.mts +1 -1
  12. package/dist/frameworks/express.mjs +3 -4
  13. package/dist/frameworks/express.mjs.map +1 -1
  14. package/dist/frameworks/fastify.d.mts +1 -1
  15. package/dist/frameworks/fastify.mjs +2 -3
  16. package/dist/frameworks/fastify.mjs.map +1 -1
  17. package/dist/frameworks/hono.d.mts +1 -1
  18. package/dist/frameworks/hono.mjs +2 -3
  19. package/dist/frameworks/hono.mjs.map +1 -1
  20. package/dist/frameworks/next.d.mts +2 -2
  21. package/dist/frameworks/next.mjs +2 -3
  22. package/dist/frameworks/next.mjs.map +1 -1
  23. package/dist/frameworks/react.d.mts +1 -1
  24. package/dist/frameworks/trpc.d.mts +1 -1
  25. package/dist/frameworks/trpc.mjs +2 -3
  26. package/dist/frameworks/trpc.mjs.map +1 -1
  27. package/dist/gerbil-DJGqq7BX.mjs +4 -0
  28. package/dist/{gerbil-yoSpRHgv.mjs → gerbil-DoDGHe6Z.mjs} +187 -19
  29. package/dist/gerbil-DoDGHe6Z.mjs.map +1 -0
  30. package/dist/{gerbil-POAz8peb.d.mts → gerbil-qOTe1nl2.d.mts} +2 -2
  31. package/dist/{gerbil-POAz8peb.d.mts.map → gerbil-qOTe1nl2.d.mts.map} +1 -1
  32. package/dist/index.d.mts +19 -3
  33. package/dist/index.d.mts.map +1 -1
  34. package/dist/index.mjs +6 -7
  35. package/dist/index.mjs.map +1 -1
  36. package/dist/integrations/ai-sdk.d.mts +1 -1
  37. package/dist/integrations/ai-sdk.mjs +4 -5
  38. package/dist/integrations/ai-sdk.mjs.map +1 -1
  39. package/dist/integrations/langchain.d.mts +1 -1
  40. package/dist/integrations/langchain.mjs +2 -3
  41. package/dist/integrations/langchain.mjs.map +1 -1
  42. package/dist/integrations/llamaindex.d.mts +1 -1
  43. package/dist/integrations/llamaindex.mjs +2 -3
  44. package/dist/integrations/llamaindex.mjs.map +1 -1
  45. package/dist/integrations/mcp-client.mjs +2 -2
  46. package/dist/integrations/mcp.d.mts +2 -2
  47. package/dist/integrations/mcp.mjs +5 -6
  48. package/dist/kokoro-BNTb6egA.mjs +20210 -0
  49. package/dist/kokoro-BNTb6egA.mjs.map +1 -0
  50. package/dist/{mcp-Bitg4sjX.mjs → mcp-kzDDWIoS.mjs} +3 -3
  51. package/dist/{mcp-Bitg4sjX.mjs.map → mcp-kzDDWIoS.mjs.map} +1 -1
  52. package/dist/{one-liner-B1rmFto6.mjs → one-liner-DxnNs_JK.mjs} +2 -2
  53. package/dist/{one-liner-B1rmFto6.mjs.map → one-liner-DxnNs_JK.mjs.map} +1 -1
  54. package/dist/repl-DGUw4fCc.mjs +9 -0
  55. package/dist/skills/index.d.mts +2 -2
  56. package/dist/skills/index.d.mts.map +1 -1
  57. package/dist/skills/index.mjs +4 -5
  58. package/dist/{skills-5DxAV-rn.mjs → skills-DulrOPeP.mjs} +12 -12
  59. package/dist/skills-DulrOPeP.mjs.map +1 -0
  60. package/dist/stt-1WIefHwc.mjs +3 -0
  61. package/dist/{stt-Bv_dum-R.mjs → stt-CG_7KB_0.mjs} +3 -2
  62. package/dist/stt-CG_7KB_0.mjs.map +1 -0
  63. package/dist/{tools-IYPrqoek.mjs → tools-Bi1P7Xoy.mjs} +2 -2
  64. package/dist/{tools-IYPrqoek.mjs.map → tools-Bi1P7Xoy.mjs.map} +1 -1
  65. package/dist/{tts-5yWeP_I0.mjs → tts-B1pZMlDv.mjs} +1 -1
  66. package/dist/{tts-DG6denWG.mjs → tts-CyHhcLtN.mjs} +6 -4
  67. package/dist/tts-CyHhcLtN.mjs.map +1 -0
  68. package/dist/{types-s6Py2_DL.d.mts → types-CiTc7ez3.d.mts} +1 -1
  69. package/dist/{types-s6Py2_DL.d.mts.map → types-CiTc7ez3.d.mts.map} +1 -1
  70. package/dist/{utils-CkB4Roi6.mjs → utils-CZBZ8dgR.mjs} +1 -1
  71. package/dist/{utils-CkB4Roi6.mjs.map → utils-CZBZ8dgR.mjs.map} +1 -1
  72. package/docs/architecture/overview.md +15 -7
  73. package/docs/tts.md +11 -8
  74. package/package.json +6 -6
  75. package/dist/browser/index.d.mts.map +0 -1
  76. package/dist/browser/index.mjs.map +0 -1
  77. package/dist/gerbil-DeQlX_Mt.mjs +0 -5
  78. package/dist/gerbil-yoSpRHgv.mjs.map +0 -1
  79. package/dist/models-BAtL8qsA.mjs +0 -171
  80. package/dist/models-BAtL8qsA.mjs.map +0 -1
  81. package/dist/models-CE0fBq0U.d.mts +0 -22
  82. package/dist/models-CE0fBq0U.d.mts.map +0 -1
  83. package/dist/repl-D20JO260.mjs +0 -10
  84. package/dist/skills-5DxAV-rn.mjs.map +0 -1
  85. package/dist/stt-Bv_dum-R.mjs.map +0 -1
  86. package/dist/stt-KzSoNvwI.mjs +0 -3
  87. package/dist/tts-DG6denWG.mjs.map +0 -1
  88. /package/dist/{auto-update-DsWBBnEk.mjs → auto-update-S9s5-g0C.mjs} +0 -0
  89. /package/dist/{chunk-Ct1HF2bE.mjs → chunk-CkXuGtQK.mjs} +0 -0
  90. /package/dist/{microphone-D-6y9aiE.mjs → microphone-DaMZFRuR.mjs} +0 -0
@@ -1,5 +1,115 @@
1
- import { o as resolveModel, t as BUILTIN_MODELS } from "../models-BAtL8qsA.mjs";
1
+ //#region src/core/models.ts
2
/**
 * Built-in model catalogue, keyed by short model id.
 *
 * Each entry carries the `repo` it resolves to, a human-readable description
 * and size, its context length, and capability flags. Every key is the
 * entry's own `id`, so the table is derived from a flat list instead of
 * repeating each id twice.
 */
const BUILTIN_MODELS = Object.fromEntries(
  [
    {
      id: "qwen3-0.6b",
      repo: "onnx-community/Qwen3-0.6B-ONNX",
      description: "Qwen3 0.6B - Best balance of speed and quality, supports thinking",
      size: "~400MB",
      contextLength: 32768,
      supportsThinking: true,
      supportsJson: true,
      family: "qwen",
    },
    {
      id: "qwen2.5-0.5b",
      repo: "onnx-community/Qwen2.5-0.5B-Instruct",
      description: "Qwen2.5 0.5B - Fast and capable",
      size: "~350MB",
      contextLength: 32768,
      supportsThinking: false,
      supportsJson: true,
      family: "qwen",
    },
    {
      id: "qwen2.5-coder-0.5b",
      repo: "onnx-community/Qwen2.5-Coder-0.5B-Instruct",
      description: "Qwen2.5 Coder 0.5B - Optimized for code",
      size: "~400MB",
      contextLength: 32768,
      supportsThinking: false,
      supportsJson: true,
      family: "qwen",
    },
    {
      id: "smollm2-360m",
      repo: "HuggingFaceTB/SmolLM2-360M-Instruct",
      description: "SmolLM2 360M - Fast, good for simple tasks",
      size: "~250MB",
      contextLength: 8192,
      supportsThinking: false,
      supportsJson: false,
      family: "smollm",
    },
    {
      id: "smollm2-135m",
      repo: "HuggingFaceTB/SmolLM2-135M-Instruct",
      description: "SmolLM2 135M - Fastest, basic generation",
      size: "~100MB",
      contextLength: 8192,
      supportsThinking: false,
      supportsJson: false,
      family: "smollm",
    },
    {
      id: "phi-3-mini",
      repo: "microsoft/Phi-3-mini-4k-instruct-onnx",
      description: "Phi-3 Mini - High quality, larger model",
      size: "~2.1GB",
      contextLength: 4096,
      supportsThinking: false,
      supportsJson: true,
      family: "phi",
    },
    {
      id: "ministral-3b",
      repo: "mistralai/Ministral-3-3B-Instruct-2512-ONNX",
      description: "Ministral 3 3B - Vision + Reasoning, 256k context",
      size: "~2.5GB",
      contextLength: 262144,
      supportsThinking: true,
      supportsJson: true,
      supportsVision: true,
      visionEncoderSize: "0.4B",
      family: "mistral",
    },
  ].map((entry) => [entry.id, entry]),
);
76
/**
 * Parse a model identifier and resolve it to a source descriptor.
 *
 * Supported formats:
 * - "qwen3-0.6b" (built-in)
 * - "hf:org/model" (HuggingFace shorthand)
 * - "https://huggingface.co/org/model" (full URL)
 * - "file:./path/to/model" (local path)
 *
 * Any other string is passed through unchanged as a HuggingFace path.
 *
 * @param {string} modelId - Identifier in one of the formats above.
 * @returns {{ type: "builtin" | "huggingface" | "local", path: string }}
 *   Where the model comes from and the repo/path to load it from.
 */
function resolveModel(modelId) {
  // Own-property check only: a bare `BUILTIN_MODELS[modelId]` lookup also
  // matches inherited members such as "constructor" or "toString" and would
  // report them as built-in models with an undefined path.
  if (Object.prototype.hasOwnProperty.call(BUILTIN_MODELS, modelId)) {
    return {
      type: "builtin",
      path: BUILTIN_MODELS[modelId].repo
    };
  }
  if (modelId.startsWith("hf:")) {
    return {
      type: "huggingface",
      path: modelId.slice(3)
    };
  }
  const hubUrlPrefix = "https://huggingface.co/";
  if (modelId.startsWith(hubUrlPrefix)) {
    return {
      type: "huggingface",
      path: modelId.slice(hubUrlPrefix.length)
    };
  }
  if (modelId.startsWith("file:")) {
    return {
      type: "local",
      path: modelId.slice(5)
    };
  }
  // Both "org/model" and bare unknown names are treated as HuggingFace paths.
  return {
    type: "huggingface",
    path: modelId
  };
}
2
111
 
112
+ //#endregion
3
113
  //#region src/browser/index.ts
4
114
  /**
5
115
  * Gerbil Browser Support
@@ -1180,6 +1290,126 @@ const TTS_MODELS = {
1180
1290
  voices: SUPERTONIC_BROWSER_VOICES
1181
1291
  }
1182
1292
  };
1293
/**
 * Source text of the text-to-speech module worker.
 *
 * The worker understands two messages:
 * - `{ type: "load", payload: { modelId, repo, voices } }` loads either the
 *   Supertonic pipeline (when `modelId` is "supertonic-66m") or Kokoro, and
 *   answers with "progress" messages followed by "ready" or "error".
 * - `{ type: "generate", payload: { text, voice, speed } }` synthesizes speech
 *   and answers with "audio" (`{ audio, sampleRate }`) or "error".
 *
 * The text is evaluated inside the worker, so it must not contain backticks
 * or template placeholders.
 */
const TTS_WORKER_CODE = `
// TTS Worker - runs in separate thread, loads from CDN
import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";

// Configure environment
env.useBrowserCache = true;
env.allowLocalModels = false;

let ttsInstance = null;
let modelType = null; // "supertonic" or "kokoro"
let voiceEmbeddings = new Map();
let kokoroTTS = null;

self.onmessage = async (e) => {
  const { type, payload } = e.data;

  if (type === "load") {
    try {
      const { modelId, repo, voices } = payload;
      modelType = modelId === "supertonic-66m" ? "supertonic" : "kokoro";

      if (modelType === "supertonic") {
        // Load Supertonic using transformers.js pipeline
        ttsInstance = await pipeline("text-to-speech", repo, {
          device: "webgpu",
          progress_callback: (progress) => {
            self.postMessage({ type: "progress", payload: progress });
          },
        });

        // Load voice embeddings. The files are independent of each other, so
        // they are fetched concurrently. A voice that fails to download is
        // skipped; generation falls back to a placeholder embedding for it.
        await Promise.all(voices.map(async (voice) => {
          try {
            const voiceUrl = "https://huggingface.co/" + repo + "/resolve/main/voices/" + voice.id + ".bin";
            const response = await fetch(voiceUrl);
            if (response.ok) {
              const buffer = await response.arrayBuffer();
              voiceEmbeddings.set(voice.id, new Float32Array(buffer));
            }
          } catch (err) {
            console.warn("Failed to load voice:", voice.id, err);
          }
        }));

        // Warmup (best effort; a failure here does not fail the load)
        try {
          await ttsInstance("Hello", {
            speaker_embeddings: new Float32Array(1 * 101 * 128),
            num_inference_steps: 1,
            speed: 1.0,
          });
        } catch (warmupErr) {
          console.warn("Warmup failed:", warmupErr);
        }
      } else {
        // Load Kokoro using kokoro-js from CDN
        const kokoroModule = await import("https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.min.js");
        const { KokoroTTS } = kokoroModule;

        kokoroTTS = await KokoroTTS.from_pretrained(repo, {
          dtype: "fp32",
          progress_callback: (progress) => {
            self.postMessage({ type: "progress", payload: progress });
          },
        });
      }

      self.postMessage({ type: "ready" });
    } catch (err) {
      self.postMessage({ type: "error", payload: err.message || String(err) });
    }
  }

  if (type === "generate") {
    try {
      const { text, voice, speed } = payload;
      let audio, sampleRate;

      // Report a clear error when generate arrives before a successful load.
      const backend = modelType === "supertonic" ? ttsInstance : kokoroTTS;
      if (!backend) {
        throw new Error("TTS model is not loaded");
      }

      if (modelType === "supertonic") {
        let embedding = voiceEmbeddings.get(voice);
        if (!embedding) {
          embedding = new Float32Array(101 * 128).fill(0.1);
        }

        const result = await ttsInstance(text, {
          speaker_embeddings: embedding,
          speed: speed || 1.0,
        });

        audio = result.audio;
        sampleRate = result.sampling_rate;
      } else {
        const result = await kokoroTTS.generate(text, {
          voice: voice,
          speed: speed || 1.0,
        });

        audio = result.audio;
        sampleRate = result.sampling_rate;
      }

      // Transfer audio data back
      self.postMessage(
        { type: "audio", payload: { audio: audio, sampleRate: sampleRate } },
        [audio.buffer]
      );
    } catch (err) {
      self.postMessage({ type: "error", payload: err.message || String(err) });
    }
  }
};
`;
1405
/**
 * Create a TTS worker instance.
 *
 * Wraps `TTS_WORKER_CODE` in a Blob and starts it as a module worker from a
 * temporary object URL.
 *
 * @returns {Worker} The newly created module worker.
 * @throws Rethrows whatever `new Worker` throws; the object URL is released
 *   in that case as well.
 */
function createTTSWorker() {
  const blob = new Blob([TTS_WORKER_CODE], { type: "application/javascript" });
  const url = URL.createObjectURL(blob);
  try {
    return new Worker(url, { type: "module" });
  } finally {
    // Revoke on both paths so a failed construction does not leak the URL.
    URL.revokeObjectURL(url);
  }
}
1183
1413
  /**
1184
1414
  * React hook for text-to-speech with Web Audio API playback
1185
1415
  *
@@ -1229,17 +1459,17 @@ function useSpeech(options = {}) {
1229
1459
  const [shouldLoad, setShouldLoad] = useState(autoLoad);
1230
1460
  const [currentVoice, setCurrentVoice] = useState(defaultVoice);
1231
1461
  const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
1232
- const ttsRef = useRef(null);
1233
- const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
1462
+ const workerRef = useRef(null);
1234
1463
  const audioContextRef = useRef(null);
1235
1464
  const sourceNodeRef = useRef(null);
1236
1465
  const mountedRef = useRef(true);
1237
1466
  const modelIdRef = useRef(modelId);
1467
+ const pendingSpeakRef = useRef(null);
1238
1468
  const listVoices = useCallback(() => {
1239
1469
  return modelConfig.voices;
1240
1470
  }, [modelConfig.voices]);
1241
1471
  const load = useCallback(() => {
1242
- if (ttsRef.current || isLoading) return;
1472
+ if (workerRef.current || isLoading) return;
1243
1473
  setIsLoading(true);
1244
1474
  setShouldLoad(true);
1245
1475
  }, [isLoading]);
@@ -1247,86 +1477,48 @@ function useSpeech(options = {}) {
1247
1477
  if (!shouldLoad) return;
1248
1478
  mountedRef.current = true;
1249
1479
  modelIdRef.current = modelId;
1250
- const initTTS = async () => {
1251
- try {
1252
- const isSupertonic = modelId === "supertonic-66m";
1253
- const config = TTS_MODELS[modelId];
1254
- setLoadingProgress({
1255
- status: "loading",
1256
- message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
1257
- });
1258
- if (isSupertonic) {
1259
- const { pipeline } = await import("@huggingface/transformers");
1260
- const tts = await pipeline("text-to-speech", config.repo, {
1261
- device: "webgpu",
1262
- progress_callback: (progress) => {
1263
- if (!mountedRef.current) return;
1264
- if (progress.status === "progress" && progress.file) setLoadingProgress({
1265
- status: "downloading",
1266
- file: progress.file,
1267
- progress: Math.round(progress.progress || 0)
1268
- });
1269
- }
1270
- });
1271
- if (!mountedRef.current) return;
1272
- const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
1273
- const embeddingsMap = /* @__PURE__ */ new Map();
1274
- await Promise.all(config.voices.map(async (voice) => {
1275
- try {
1276
- const response = await fetch(`${voicesUrl}${voice.id}.bin`);
1277
- if (response.ok) {
1278
- const buffer = await response.arrayBuffer();
1279
- embeddingsMap.set(voice.id, new Float32Array(buffer));
1280
- }
1281
- } catch (e) {
1282
- console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
1283
- }
1284
- }));
1285
- if (!mountedRef.current) return;
1286
- try {
1287
- await tts("Hello", {
1288
- speaker_embeddings: new Float32Array(12928),
1289
- num_inference_steps: 1,
1290
- speed: 1
1291
- });
1292
- } catch (e) {
1293
- console.warn("Supertonic warmup failed:", e);
1294
- }
1295
- voiceEmbeddingsRef.current = embeddingsMap;
1296
- ttsRef.current = {
1297
- type: "supertonic",
1298
- pipeline: tts,
1299
- config
1300
- };
1301
- } else {
1302
- const { KokoroTTS } = await import("kokoro-js");
1303
- const tts = await KokoroTTS.from_pretrained(config.repo, {
1304
- dtype: "fp32",
1305
- progress_callback: (progress) => {
1306
- if (!mountedRef.current) return;
1307
- if (progress.status === "progress" && progress.file) setLoadingProgress({
1308
- status: "downloading",
1309
- file: progress.file,
1310
- progress: Math.round(progress.progress || 0)
1311
- });
1312
- }
1313
- });
1314
- if (!mountedRef.current) return;
1315
- ttsRef.current = {
1316
- type: "kokoro",
1317
- instance: tts,
1318
- config
1319
- };
1320
- }
1480
+ const config = TTS_MODELS[modelId];
1481
+ setLoadingProgress({
1482
+ status: "loading",
1483
+ message: `Loading ${modelId === "supertonic-66m" ? "Supertonic" : "Kokoro"} TTS...`
1484
+ });
1485
+ const worker = createTTSWorker();
1486
+ workerRef.current = worker;
1487
+ worker.onmessage = (e) => {
1488
+ if (!mountedRef.current) return;
1489
+ const { type, payload } = e.data;
1490
+ if (type === "progress" && payload.status === "progress" && payload.file) setLoadingProgress({
1491
+ status: "downloading",
1492
+ file: payload.file,
1493
+ progress: Math.round(payload.progress || 0)
1494
+ });
1495
+ if (type === "ready") {
1321
1496
  setIsLoading(false);
1322
1497
  setIsReady(true);
1323
1498
  setLoadingProgress({ status: "ready" });
1324
1499
  onReady?.();
1325
- } catch (err) {
1326
- if (!mountedRef.current) return;
1327
- const errorMsg = err instanceof Error ? err.message : String(err);
1500
+ if (pendingSpeakRef.current) {
1501
+ const { text, voice, speed } = pendingSpeakRef.current;
1502
+ pendingSpeakRef.current = null;
1503
+ worker.postMessage({
1504
+ type: "generate",
1505
+ payload: {
1506
+ text,
1507
+ voice,
1508
+ speed
1509
+ }
1510
+ });
1511
+ }
1512
+ }
1513
+ if (type === "audio") {
1514
+ const { audio, sampleRate } = payload;
1515
+ playAudioData(audio, sampleRate);
1516
+ }
1517
+ if (type === "error") {
1518
+ const errorMsg = payload;
1328
1519
  setError(errorMsg);
1329
1520
  setIsLoading(false);
1521
+ setIsSpeaking(false);
1330
1522
  setLoadingProgress({
1331
1523
  status: "error",
1332
1524
  error: errorMsg
@@ -1334,9 +1526,29 @@ function useSpeech(options = {}) {
1334
1526
  onError?.(errorMsg);
1335
1527
  }
1336
1528
  };
1337
- initTTS();
1529
+ worker.onerror = (err) => {
1530
+ if (!mountedRef.current) return;
1531
+ const errorMsg = err.message || "Worker error";
1532
+ setError(errorMsg);
1533
+ setIsLoading(false);
1534
+ setLoadingProgress({
1535
+ status: "error",
1536
+ error: errorMsg
1537
+ });
1538
+ onError?.(errorMsg);
1539
+ };
1540
+ worker.postMessage({
1541
+ type: "load",
1542
+ payload: {
1543
+ modelId,
1544
+ repo: config.repo,
1545
+ voices: config.voices
1546
+ }
1547
+ });
1338
1548
  return () => {
1339
1549
  mountedRef.current = false;
1550
+ worker.terminate();
1551
+ workerRef.current = null;
1340
1552
  };
1341
1553
  }, [
1342
1554
  shouldLoad,
@@ -1344,6 +1556,30 @@ function useSpeech(options = {}) {
1344
1556
  onReady,
1345
1557
  onError
1346
1558
  ]);
1559
+ const playAudioData = useCallback(async (audio, sampleRate) => {
1560
+ try {
1561
+ if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext({ sampleRate });
1562
+ const ctx = audioContextRef.current;
1563
+ if (ctx.state === "suspended") await ctx.resume();
1564
+ const audioBuffer = ctx.createBuffer(1, audio.length, sampleRate);
1565
+ audioBuffer.copyToChannel(new Float32Array(audio), 0);
1566
+ const sourceNode = ctx.createBufferSource();
1567
+ sourceNode.buffer = audioBuffer;
1568
+ sourceNode.connect(ctx.destination);
1569
+ sourceNodeRef.current = sourceNode;
1570
+ sourceNode.onended = () => {
1571
+ if (!mountedRef.current) return;
1572
+ setIsSpeaking(false);
1573
+ onEnd?.();
1574
+ };
1575
+ sourceNode.start();
1576
+ } catch (err) {
1577
+ setIsSpeaking(false);
1578
+ const errorMsg = err instanceof Error ? err.message : String(err);
1579
+ setError(errorMsg);
1580
+ onError?.(errorMsg);
1581
+ }
1582
+ }, [onEnd, onError]);
1347
1583
  useEffect(() => {
1348
1584
  return () => {
1349
1585
  try {
@@ -1358,89 +1594,46 @@ function useSpeech(options = {}) {
1358
1594
  speak: useCallback(async (text, opts) => {
1359
1595
  const voice = opts?.voice || currentVoice;
1360
1596
  const speed = opts?.speed || currentSpeed;
1361
- if (!ttsRef.current) {
1597
+ if (!modelConfig.voices.find((v) => v.id === voice)) {
1598
+ const errorMsg = `Voice "${voice}" not found. Should be one of: ${modelConfig.voices.map((v) => v.id).join(", ")}.`;
1599
+ setError(errorMsg);
1600
+ onError?.(errorMsg);
1601
+ return;
1602
+ }
1603
+ if (!workerRef.current) {
1604
+ pendingSpeakRef.current = {
1605
+ text,
1606
+ voice,
1607
+ speed
1608
+ };
1362
1609
  load();
1363
1610
  return;
1364
1611
  }
1365
- try {
1366
- setIsSpeaking(true);
1367
- onStart?.();
1368
- let audioData;
1369
- let sampleRate;
1370
- const ttsBackend = ttsRef.current;
1371
- if (ttsBackend.type === "supertonic") {
1372
- const config = ttsBackend.config;
1373
- if (!config.voices.find((v) => v.id === voice)) {
1374
- const validVoices = config.voices.map((v) => v.id).join(", ");
1375
- throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1376
- }
1377
- let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
1378
- if (!speakerEmbedding) try {
1379
- const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
1380
- const response = await fetch(voiceUrl);
1381
- if (response.ok) {
1382
- const buffer = await response.arrayBuffer();
1383
- speakerEmbedding = new Float32Array(buffer);
1384
- voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1385
- } else throw new Error(`Failed to load voice: ${response.status}`);
1386
- } catch {
1387
- speakerEmbedding = new Float32Array(12928).fill(.1);
1388
- voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
1389
- }
1390
- const result = await ttsBackend.pipeline(text, {
1391
- speaker_embeddings: speakerEmbedding,
1392
- speed
1393
- });
1394
- audioData = result.audio;
1395
- sampleRate = result.sampling_rate;
1396
- } else {
1397
- const config = ttsBackend.config;
1398
- if (!config.voices.find((v) => v.id === voice)) {
1399
- const validVoices = config.voices.map((v) => v.id).join(", ");
1400
- throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
1401
- }
1402
- const result = await ttsBackend.instance.generate(text, {
1403
- voice,
1404
- speed
1405
- });
1406
- audioData = result.audio;
1407
- sampleRate = result.sampling_rate;
1408
- }
1409
- if (!mountedRef.current) return;
1410
- if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
1411
- const audioContext = audioContextRef.current;
1412
- if (audioContext.state === "suspended") await audioContext.resume();
1413
- const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
1414
- const channelData = new Float32Array(audioData);
1415
- audioBuffer.copyToChannel(channelData, 0);
1416
- if (sourceNodeRef.current) {
1417
- sourceNodeRef.current.stop();
1418
- sourceNodeRef.current.disconnect();
1419
- }
1420
- const sourceNode = audioContext.createBufferSource();
1421
- sourceNode.buffer = audioBuffer;
1422
- sourceNode.connect(audioContext.destination);
1423
- sourceNode.onended = () => {
1424
- if (mountedRef.current) {
1425
- setIsSpeaking(false);
1426
- onEnd?.();
1427
- }
1612
+ if (!isReady) {
1613
+ pendingSpeakRef.current = {
1614
+ text,
1615
+ voice,
1616
+ speed
1428
1617
  };
1429
- sourceNodeRef.current = sourceNode;
1430
- sourceNode.start();
1431
- } catch (err) {
1432
- if (!mountedRef.current) return;
1433
- const errorMsg = err instanceof Error ? err.message : String(err);
1434
- setError(errorMsg);
1435
- setIsSpeaking(false);
1436
- onError?.(errorMsg);
1618
+ return;
1437
1619
  }
1620
+ setIsSpeaking(true);
1621
+ onStart?.();
1622
+ workerRef.current.postMessage({
1623
+ type: "generate",
1624
+ payload: {
1625
+ text,
1626
+ voice,
1627
+ speed
1628
+ }
1629
+ });
1438
1630
  }, [
1439
1631
  currentVoice,
1440
1632
  currentSpeed,
1633
+ modelConfig.voices,
1441
1634
  load,
1635
+ isReady,
1442
1636
  onStart,
1443
- onEnd,
1444
1637
  onError
1445
1638
  ]),
1446
1639
  stop: useCallback(() => {
@@ -1563,6 +1756,61 @@ function createAudioPlayer(sampleRate = 24e3) {
1563
1756
  isPlaying: () => isActive
1564
1757
  };
1565
1758
  }
1759
/**
 * Source text of the speech-to-text module worker.
 *
 * The worker understands two messages:
 * - `{ type: "load", payload: { model } }` loads an
 *   "automatic-speech-recognition" pipeline for `model` and answers with
 *   "progress" messages followed by "ready" or "error".
 * - `{ type: "transcribe", payload: { audio } }` runs the pipeline on `audio`
 *   and answers with "transcript" (the recognized text) or "error".
 *
 * The text is evaluated inside the worker, so it must not contain backticks
 * or template placeholders.
 */
const STT_WORKER_CODE = `
// STT Worker - runs in separate thread, loads from CDN
import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";

// Configure environment
env.useBrowserCache = true;
env.allowLocalModels = false;

let sttPipeline = null;

self.onmessage = async (e) => {
  const { type, payload } = e.data;

  if (type === "load") {
    try {
      const { model } = payload;

      // Load Whisper model
      sttPipeline = await pipeline("automatic-speech-recognition", model, {
        device: "webgpu",
        progress_callback: (progress) => {
          self.postMessage({ type: "progress", payload: progress });
        },
      });

      self.postMessage({ type: "ready" });
    } catch (err) {
      self.postMessage({ type: "error", payload: err.message || String(err) });
    }
  }

  if (type === "transcribe") {
    try {
      const { audio } = payload;

      // Report a clear error when transcribe arrives before a successful load.
      if (!sttPipeline) {
        throw new Error("STT model is not loaded");
      }

      // Run transcription
      const result = await sttPipeline(audio, {
        return_timestamps: false,
      });

      self.postMessage({ type: "transcript", payload: result.text || "" });
    } catch (err) {
      self.postMessage({ type: "error", payload: err.message || String(err) });
    }
  }
};
`;
1806
/**
 * Create an STT worker instance.
 *
 * Wraps `STT_WORKER_CODE` in a Blob and starts it as a module worker from a
 * temporary object URL.
 *
 * @returns {Worker} The newly created module worker.
 * @throws Rethrows whatever `new Worker` throws; the object URL is released
 *   in that case as well.
 */
function createSTTWorker() {
  const blob = new Blob([STT_WORKER_CODE], { type: "application/javascript" });
  const url = URL.createObjectURL(blob);
  try {
    return new Worker(url, { type: "module" });
  } finally {
    // Revoke on both paths so a failed construction does not leak the URL.
    URL.revokeObjectURL(url);
  }
}
1566
1814
  /**
1567
1815
  * React hook for voice input with browser microphone
1568
1816
  *
@@ -1620,7 +1868,7 @@ function useVoiceInput(options = {}) {
1620
1868
  const [chunkCount, setChunkCount] = useState(0);
1621
1869
  const [error, setError] = useState(null);
1622
1870
  const [shouldLoad, setShouldLoad] = useState(autoLoad);
1623
- const sttRef = useRef(null);
1871
+ const workerRef = useRef(null);
1624
1872
  const mediaRecorderRef = useRef(null);
1625
1873
  const audioChunksRef = useRef([]);
1626
1874
  const streamRef = useRef(null);
@@ -1628,49 +1876,66 @@ function useVoiceInput(options = {}) {
1628
1876
  const streamingIntervalRef = useRef(null);
1629
1877
  const pendingChunksRef = useRef([]);
1630
1878
  const fullTranscriptRef = useRef("");
1879
+ const transcribeResolveRef = useRef(null);
1880
+ const transcribeRejectRef = useRef(null);
1881
+ const resolveSTTModel = (modelId) => {
1882
+ return {
1883
+ "whisper-tiny": "onnx-community/whisper-tiny",
1884
+ "whisper-tiny.en": "onnx-community/whisper-tiny.en",
1885
+ "whisper-base": "onnx-community/whisper-base",
1886
+ "whisper-base.en": "onnx-community/whisper-base.en",
1887
+ "whisper-small": "onnx-community/whisper-small",
1888
+ "whisper-small.en": "onnx-community/whisper-small.en"
1889
+ }[modelId] || modelId;
1890
+ };
1631
1891
  useEffect(() => {
1632
1892
  if (!shouldLoad || isReady) return;
1633
- let cancelled = false;
1634
- const loadModel = async () => {
1635
- try {
1636
- setIsLoading(true);
1637
- setLoadingProgress({
1638
- status: "loading",
1639
- message: "Loading STT model..."
1640
- });
1641
- onProgress?.({
1642
- status: "loading",
1643
- message: "Loading STT model..."
1644
- });
1645
- const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
1646
- if (cancelled || !mountedRef.current) return;
1647
- const stt = new WhisperSTT(model);
1648
- await stt.load({ onProgress: (p) => {
1649
- if (!mountedRef.current) return;
1650
- const progress = {
1651
- status: p.progress !== void 0 ? "downloading" : "loading",
1652
- message: p.status,
1653
- progress: p.progress,
1654
- file: p.file
1655
- };
1656
- setLoadingProgress(progress);
1657
- onProgress?.(progress);
1658
- } });
1659
- if (cancelled || !mountedRef.current) {
1660
- stt.dispose();
1661
- return;
1662
- }
1663
- sttRef.current = stt;
1893
+ mountedRef.current = true;
1894
+ setIsLoading(true);
1895
+ setLoadingProgress({
1896
+ status: "loading",
1897
+ message: "Loading STT model..."
1898
+ });
1899
+ onProgress?.({
1900
+ status: "loading",
1901
+ message: "Loading STT model..."
1902
+ });
1903
+ const worker = createSTTWorker();
1904
+ workerRef.current = worker;
1905
+ worker.onmessage = (e) => {
1906
+ if (!mountedRef.current) return;
1907
+ const { type, payload } = e.data;
1908
+ if (type === "progress") {
1909
+ const progress = {
1910
+ status: payload.progress !== void 0 ? "downloading" : "loading",
1911
+ message: payload.status,
1912
+ progress: payload.progress,
1913
+ file: payload.file
1914
+ };
1915
+ setLoadingProgress(progress);
1916
+ onProgress?.(progress);
1917
+ }
1918
+ if (type === "ready") {
1664
1919
  setIsReady(true);
1665
1920
  setIsLoading(false);
1666
1921
  setLoadingProgress({ status: "ready" });
1667
1922
  onProgress?.({ status: "ready" });
1668
1923
  onReady?.();
1669
- } catch (e) {
1670
- if (!mountedRef.current) return;
1671
- const errMsg = e.message || "Failed to load STT model";
1924
+ }
1925
+ if (type === "transcript") {
1926
+ const text = payload;
1927
+ setIsTranscribing(false);
1928
+ if (transcribeResolveRef.current) {
1929
+ transcribeResolveRef.current(text);
1930
+ transcribeResolveRef.current = null;
1931
+ transcribeRejectRef.current = null;
1932
+ }
1933
+ }
1934
+ if (type === "error") {
1935
+ const errMsg = payload;
1672
1936
  setError(errMsg);
1673
1937
  setIsLoading(false);
1938
+ setIsTranscribing(false);
1674
1939
  setLoadingProgress({
1675
1940
  status: "error",
1676
1941
  message: errMsg
@@ -1680,11 +1945,36 @@ function useVoiceInput(options = {}) {
1680
1945
  message: errMsg
1681
1946
  });
1682
1947
  onError?.(errMsg);
1948
+ if (transcribeRejectRef.current) {
1949
+ transcribeRejectRef.current(new Error(errMsg));
1950
+ transcribeResolveRef.current = null;
1951
+ transcribeRejectRef.current = null;
1952
+ }
1683
1953
  }
1684
1954
  };
1685
- loadModel();
1955
+ worker.onerror = (err) => {
1956
+ if (!mountedRef.current) return;
1957
+ const errMsg = err.message || "Worker error";
1958
+ setError(errMsg);
1959
+ setIsLoading(false);
1960
+ setLoadingProgress({
1961
+ status: "error",
1962
+ message: errMsg
1963
+ });
1964
+ onProgress?.({
1965
+ status: "error",
1966
+ message: errMsg
1967
+ });
1968
+ onError?.(errMsg);
1969
+ };
1970
+ worker.postMessage({
1971
+ type: "load",
1972
+ payload: { model: resolveSTTModel(model) }
1973
+ });
1686
1974
  return () => {
1687
- cancelled = true;
1975
+ mountedRef.current = false;
1976
+ worker.terminate();
1977
+ workerRef.current = null;
1688
1978
  };
1689
1979
  }, [
1690
1980
  shouldLoad,
@@ -1698,7 +1988,10 @@ function useVoiceInput(options = {}) {
1698
1988
  mountedRef.current = true;
1699
1989
  return () => {
1700
1990
  mountedRef.current = false;
1701
- if (sttRef.current) sttRef.current.dispose();
1991
+ if (workerRef.current) {
1992
+ workerRef.current.terminate();
1993
+ workerRef.current = null;
1994
+ }
1702
1995
  if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
1703
1996
  };
1704
1997
  }, []);
@@ -1732,27 +2025,38 @@ function useVoiceInput(options = {}) {
1732
2025
  return new Float32Array(channelData);
1733
2026
  }, []);
1734
2027
  const transcribe = useCallback(async (audio) => {
1735
- if (!sttRef.current) {
2028
+ if (!workerRef.current) {
1736
2029
  if (!shouldLoad) {
1737
2030
  setShouldLoad(true);
1738
2031
  throw new Error("STT model not loaded. Loading now, please try again.");
1739
2032
  }
1740
2033
  throw new Error("STT model not loaded");
1741
2034
  }
2035
+ if (!isReady) throw new Error("STT model still loading");
1742
2036
  setIsTranscribing(true);
1743
- try {
1744
- let text = (await sttRef.current.transcribe(audio)).text.trim();
1745
- if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
1746
- setTranscript(text);
1747
- onTranscript?.(text);
1748
- return text;
1749
- } finally {
1750
- if (mountedRef.current) setIsTranscribing(false);
1751
- }
1752
- }, [shouldLoad, onTranscript]);
2037
+ return new Promise((resolve, reject) => {
2038
+ transcribeResolveRef.current = (text) => {
2039
+ let filtered = text.trim();
2040
+ if (filtered === "[BLANK_AUDIO]" || filtered === "(blank audio)" || filtered === "[BLANK AUDIO]") filtered = "";
2041
+ setTranscript(filtered);
2042
+ onTranscript?.(filtered);
2043
+ resolve(filtered);
2044
+ };
2045
+ transcribeRejectRef.current = reject;
2046
+ const audioArray = new Float32Array(audio);
2047
+ workerRef.current.postMessage({
2048
+ type: "transcribe",
2049
+ payload: { audio: audioArray }
2050
+ }, [audioArray.buffer]);
2051
+ });
2052
+ }, [
2053
+ shouldLoad,
2054
+ isReady,
2055
+ onTranscript
2056
+ ]);
1753
2057
  const processedSamplesRef = useRef(0);
1754
2058
  const transcribeChunk = useCallback(async (chunkIdx) => {
1755
- if (!sttRef.current || audioChunksRef.current.length === 0) return "";
2059
+ if (!workerRef.current || !isReady || audioChunksRef.current.length === 0) return "";
1756
2060
  try {
1757
2061
  const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
1758
2062
  const newSamplesStart = processedSamplesRef.current;
@@ -1760,8 +2064,7 @@ function useVoiceInput(options = {}) {
1760
2064
  if (totalSamples - newSamplesStart < 8e3) return "";
1761
2065
  const newAudio = audioData.slice(newSamplesStart);
1762
2066
  processedSamplesRef.current = totalSamples;
1763
- let text = (await sttRef.current.transcribe(newAudio)).text.trim();
1764
- if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
2067
+ const text = await transcribe(newAudio);
1765
2068
  if (text && mountedRef.current) {
1766
2069
  setStreamingChunk(text);
1767
2070
  onChunk?.(text, chunkIdx);
@@ -1770,38 +2073,30 @@ function useVoiceInput(options = {}) {
1770
2073
  } catch {
1771
2074
  return "";
1772
2075
  }
1773
- }, [blobToFloat32, onChunk]);
2076
+ }, [
2077
+ blobToFloat32,
2078
+ isReady,
2079
+ transcribe,
2080
+ onChunk
2081
+ ]);
1774
2082
  return {
1775
2083
  startRecording: useCallback(async () => {
1776
2084
  if (isRecording) return;
1777
2085
  try {
1778
- if (streaming && !sttRef.current) {
2086
+ if (streaming && !isReady) {
1779
2087
  if (!shouldLoad) setShouldLoad(true);
1780
- setIsLoading(true);
1781
- const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
1782
- const stt = new WhisperSTT(model);
1783
- await stt.load({ onProgress: (p) => {
1784
- if (mountedRef.current) {
1785
- const progress = {
1786
- status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
1787
- message: p.status,
1788
- progress: p.progress,
1789
- file: p.file
1790
- };
1791
- setLoadingProgress(progress);
1792
- onProgress?.(progress);
1793
- }
1794
- } });
1795
- if (!mountedRef.current) {
1796
- stt.dispose();
1797
- return;
1798
- }
1799
- sttRef.current = stt;
1800
- setIsReady(true);
1801
- setIsLoading(false);
1802
- setLoadingProgress({ status: "ready" });
1803
- onProgress?.({ status: "ready" });
1804
- onReady?.();
2088
+ await new Promise((resolve, reject) => {
2089
+ const checkReady = setInterval(() => {
2090
+ if (isReady && workerRef.current) {
2091
+ clearInterval(checkReady);
2092
+ resolve();
2093
+ }
2094
+ }, 100);
2095
+ setTimeout(() => {
2096
+ clearInterval(checkReady);
2097
+ reject(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
2098
+ }, 6e4);
2099
+ });
1805
2100
  }
1806
2101
  const stream = await navigator.mediaDevices.getUserMedia({ audio: {
1807
2102
  sampleRate: 16e3,
@@ -1828,7 +2123,7 @@ function useVoiceInput(options = {}) {
1828
2123
  mediaRecorder.start(100);
1829
2124
  setIsRecording(true);
1830
2125
  setError(null);
1831
- if (streaming && sttRef.current) {
2126
+ if (streaming && isReady && workerRef.current) {
1832
2127
  let chunkIdx = 0;
1833
2128
  let shouldContinue = true;
1834
2129
  const processNextChunk = async () => {
@@ -1918,11 +2213,11 @@ function useVoiceInput(options = {}) {
1918
2213
  }
1919
2214
  const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
1920
2215
  try {
1921
- if (!sttRef.current) {
2216
+ if (!isReady || !workerRef.current) {
1922
2217
  if (!shouldLoad) setShouldLoad(true);
1923
2218
  await new Promise((res, rej) => {
1924
2219
  const checkReady = setInterval(() => {
1925
- if (sttRef.current) {
2220
+ if (isReady && workerRef.current) {
1926
2221
  clearInterval(checkReady);
1927
2222
  res();
1928
2223
  }
@@ -2053,6 +2348,16 @@ function useVoiceChat(options = {}) {
2053
2348
  const isListening = stage === "listening";
2054
2349
  const isProcessing = stage === "transcribing" || stage === "thinking";
2055
2350
  const isSpeaking = stage === "speaking";
2351
+ const resolveSTTModel = (modelId) => {
2352
+ return {
2353
+ "whisper-tiny": "onnx-community/whisper-tiny",
2354
+ "whisper-tiny.en": "onnx-community/whisper-tiny.en",
2355
+ "whisper-base": "onnx-community/whisper-base",
2356
+ "whisper-base.en": "onnx-community/whisper-base.en",
2357
+ "whisper-small": "onnx-community/whisper-small",
2358
+ "whisper-small.en": "onnx-community/whisper-small.en"
2359
+ }[modelId] || modelId;
2360
+ };
2056
2361
  useEffect(() => {
2057
2362
  if (!shouldLoad || isReady) return;
2058
2363
  let cancelled = false;
@@ -2061,18 +2366,29 @@ function useVoiceChat(options = {}) {
2061
2366
  setIsLoading(true);
2062
2367
  setError(null);
2063
2368
  setLoadingMessage("Loading speech recognition (Whisper)...");
2064
- const { WhisperSTT } = await import("../stt-KzSoNvwI.mjs");
2065
- if (cancelled || !mountedRef.current) return;
2066
- const stt = new WhisperSTT(sttModel);
2067
- await stt.load({ onProgress: (p) => {
2068
- if (!mountedRef.current) return;
2069
- setLoadingMessage(p.status || "Loading STT...");
2070
- } });
2369
+ const sttWorker = createSTTWorker();
2071
2370
  if (cancelled || !mountedRef.current) {
2072
- stt.dispose();
2371
+ sttWorker.terminate();
2073
2372
  return;
2074
2373
  }
2075
- sttRef.current = stt;
2374
+ await new Promise((resolve, reject) => {
2375
+ sttWorker.onmessage = (e) => {
2376
+ const { type, payload } = e.data;
2377
+ if (type === "ready") resolve();
2378
+ if (type === "error") reject(new Error(payload));
2379
+ if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading STT...");
2380
+ };
2381
+ sttWorker.onerror = (e) => reject(new Error(e.message));
2382
+ sttWorker.postMessage({
2383
+ type: "load",
2384
+ payload: { model: resolveSTTModel(sttModel) }
2385
+ });
2386
+ });
2387
+ if (cancelled || !mountedRef.current) {
2388
+ sttWorker.terminate();
2389
+ return;
2390
+ }
2391
+ sttRef.current = sttWorker;
2076
2392
  setLoadingMessage("Loading language model...");
2077
2393
  const worker = await createGerbilWorker({
2078
2394
  modelId: llmModel,
@@ -2087,18 +2403,34 @@ function useVoiceChat(options = {}) {
2087
2403
  }
2088
2404
  llmWorkerRef.current = worker;
2089
2405
  setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
2090
- const { createTTS } = await import("../tts-5yWeP_I0.mjs");
2091
- if (cancelled || !mountedRef.current) return;
2092
- const tts = createTTS(ttsModelId);
2093
- await tts.load({ onProgress: (p) => {
2094
- if (!mountedRef.current) return;
2095
- setLoadingMessage(p.status || "Loading TTS...");
2096
- } });
2406
+ const ttsWorker = createTTSWorker();
2407
+ if (cancelled || !mountedRef.current) {
2408
+ ttsWorker.terminate();
2409
+ return;
2410
+ }
2411
+ const ttsConfig$1 = TTS_MODELS[ttsModelId];
2412
+ await new Promise((resolve, reject) => {
2413
+ ttsWorker.onmessage = (e) => {
2414
+ const { type, payload } = e.data;
2415
+ if (type === "ready") resolve();
2416
+ if (type === "error") reject(new Error(payload));
2417
+ if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading TTS...");
2418
+ };
2419
+ ttsWorker.onerror = (e) => reject(new Error(e.message));
2420
+ ttsWorker.postMessage({
2421
+ type: "load",
2422
+ payload: {
2423
+ modelId: ttsModelId,
2424
+ repo: ttsConfig$1.repo,
2425
+ voices: ttsConfig$1.voices
2426
+ }
2427
+ });
2428
+ });
2097
2429
  if (cancelled || !mountedRef.current) {
2098
- await tts.dispose();
2430
+ ttsWorker.terminate();
2099
2431
  return;
2100
2432
  }
2101
- ttsRef.current = tts;
2433
+ ttsRef.current = ttsWorker;
2102
2434
  setIsReady(true);
2103
2435
  setIsLoading(false);
2104
2436
  setLoadingMessage("Ready!");
@@ -2127,8 +2459,8 @@ function useVoiceChat(options = {}) {
2127
2459
  return () => {
2128
2460
  mountedRef.current = false;
2129
2461
  llmWorkerRef.current?.terminate();
2130
- sttRef.current?.dispose();
2131
- ttsRef.current?.dispose();
2462
+ sttRef.current?.terminate();
2463
+ ttsRef.current?.terminate();
2132
2464
  if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
2133
2465
  audioContextRef.current?.close();
2134
2466
  };
@@ -2234,7 +2566,26 @@ function useVoiceChat(options = {}) {
2234
2566
  try {
2235
2567
  setStage("transcribing");
2236
2568
  const audioData = await blobToFloat32(audioBlob);
2237
- let userText = (await sttRef.current.transcribe(audioData)).text.trim();
2569
+ let userText = await new Promise((sttResolve, sttReject) => {
2570
+ const handler = (e) => {
2571
+ const { type, payload } = e.data;
2572
+ if (type === "transcript") {
2573
+ sttRef.current?.removeEventListener("message", handler);
2574
+ sttResolve(payload);
2575
+ }
2576
+ if (type === "error") {
2577
+ sttRef.current?.removeEventListener("message", handler);
2578
+ sttReject(new Error(payload));
2579
+ }
2580
+ };
2581
+ sttRef.current?.addEventListener("message", handler);
2582
+ const audioArray = new Float32Array(audioData);
2583
+ sttRef.current?.postMessage({
2584
+ type: "transcribe",
2585
+ payload: { audio: audioArray }
2586
+ }, [audioArray.buffer]);
2587
+ });
2588
+ userText = userText.trim();
2238
2589
  if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
2239
2590
  if (cancelledRef.current || !userText) {
2240
2591
  setStage("idle");
@@ -2284,9 +2635,30 @@ function useVoiceChat(options = {}) {
2284
2635
  onAssistantSpeak?.(responseText);
2285
2636
  if (responseText.trim()) {
2286
2637
  setStage("speaking");
2287
- const ttsResult = await ttsRef.current.speak(responseText, {
2288
- voice,
2289
- speed
2638
+ const ttsResult = await new Promise((ttsResolve, ttsReject) => {
2639
+ const handler = (e) => {
2640
+ const { type, payload } = e.data;
2641
+ if (type === "audio") {
2642
+ ttsRef.current?.removeEventListener("message", handler);
2643
+ ttsResolve({
2644
+ audio: payload.audio,
2645
+ sampleRate: payload.sampleRate
2646
+ });
2647
+ }
2648
+ if (type === "error") {
2649
+ ttsRef.current?.removeEventListener("message", handler);
2650
+ ttsReject(new Error(payload));
2651
+ }
2652
+ };
2653
+ ttsRef.current?.addEventListener("message", handler);
2654
+ ttsRef.current?.postMessage({
2655
+ type: "generate",
2656
+ payload: {
2657
+ text: responseText,
2658
+ voice,
2659
+ speed
2660
+ }
2661
+ });
2290
2662
  });
2291
2663
  if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
2292
2664
  }
@@ -2378,4 +2750,4 @@ var browser_default = {
2378
2750
 
2379
2751
  //#endregion
2380
2752
  export { BUILTIN_MODELS, createAudioPlayer, createGerbilWorker, browser_default as default, getWebGPUInfo, isWebGPUSupported, playAudio, useChat, useCompletion, useSpeech, useVoiceChat, useVoiceInput };
2381
- //# sourceMappingURL=index.mjs.map
2753
+ //# sourceMappingURL=index.js.map