@tryhamster/gerbil 1.0.0-rc.10 → 1.0.0-rc.11

This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -1290,6 +1290,126 @@ const TTS_MODELS = {
     voices: SUPERTONIC_BROWSER_VOICES
   }
 };
+const TTS_WORKER_CODE = `
+// TTS Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let ttsInstance = null;
+let modelType = null; // "supertonic" or "kokoro"
+let voiceEmbeddings = new Map();
+let kokoroTTS = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { modelId, repo, voices } = payload;
+      modelType = modelId === "supertonic-66m" ? "supertonic" : "kokoro";
+
+      if (modelType === "supertonic") {
+        // Load Supertonic using transformers.js pipeline
+        ttsInstance = await pipeline("text-to-speech", repo, {
+          device: "webgpu",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+
+        // Load voice embeddings
+        for (const voice of voices) {
+          try {
+            const voiceUrl = "https://huggingface.co/" + repo + "/resolve/main/voices/" + voice.id + ".bin";
+            const response = await fetch(voiceUrl);
+            if (response.ok) {
+              const buffer = await response.arrayBuffer();
+              voiceEmbeddings.set(voice.id, new Float32Array(buffer));
+            }
+          } catch (err) {
+            console.warn("Failed to load voice:", voice.id, err);
+          }
+        }
+
+        // Warmup
+        try {
+          await ttsInstance("Hello", {
+            speaker_embeddings: new Float32Array(1 * 101 * 128),
+            num_inference_steps: 1,
+            speed: 1.0,
+          });
+        } catch (e) {
+          console.warn("Warmup failed:", e);
+        }
+      } else {
+        // Load Kokoro using kokoro-js from CDN
+        const kokoroModule = await import("https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.min.js");
+        const { KokoroTTS } = kokoroModule;
+
+        kokoroTTS = await KokoroTTS.from_pretrained(repo, {
+          dtype: "fp32",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+      }
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "generate") {
+    try {
+      const { text, voice, speed } = payload;
+      let audio, sampleRate;
+
+      if (modelType === "supertonic") {
+        let embedding = voiceEmbeddings.get(voice);
+        if (!embedding) {
+          embedding = new Float32Array(101 * 128).fill(0.1);
+        }
+
+        const result = await ttsInstance(text, {
+          speaker_embeddings: embedding,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      } else {
+        const result = await kokoroTTS.generate(text, {
+          voice: voice,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      }
+
+      // Transfer audio data back
+      self.postMessage(
+        { type: "audio", payload: { audio: audio, sampleRate: sampleRate } },
+        [audio.buffer]
+      );
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create TTS worker instance */
+function createTTSWorker() {
+  const blob = new Blob([TTS_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for text-to-speech with Web Audio API playback
  *
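
Note: rc.11 moves both TTS backends off the main thread. createTTSWorker (above) spins the worker up from an inline Blob URL, so no separate worker file ships with the package and the CDN import happens inside the worker. The same pattern in isolation (a sketch; workerSource is a stand-in for TTS_WORKER_CODE):

// Spawn a module worker from inline source - no extra file needed.
const workerSource = `self.onmessage = (e) => self.postMessage(e.data * 2);`;
const blob = new Blob([workerSource], { type: "application/javascript" });
const url = URL.createObjectURL(blob);
const worker = new Worker(url, { type: "module" });
URL.revokeObjectURL(url); // revoked immediately, as createTTSWorker does
worker.onmessage = (e) => console.log(e.data); // logs 4
worker.postMessage(2);
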
@@ -1339,17 +1459,17 @@ function useSpeech(options = {}) {
   const [shouldLoad, setShouldLoad] = useState(autoLoad);
   const [currentVoice, setCurrentVoice] = useState(defaultVoice);
   const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
-  const ttsRef = useRef(null);
-  const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
+  const workerRef = useRef(null);
   const audioContextRef = useRef(null);
   const sourceNodeRef = useRef(null);
   const mountedRef = useRef(true);
   const modelIdRef = useRef(modelId);
+  const pendingSpeakRef = useRef(null);
   const listVoices = useCallback(() => {
     return modelConfig.voices;
   }, [modelConfig.voices]);
   const load = useCallback(() => {
-    if (ttsRef.current || isLoading) return;
+    if (workerRef.current || isLoading) return;
     setIsLoading(true);
     setShouldLoad(true);
   }, [isLoading]);
@@ -1357,87 +1477,48 @@ function useSpeech(options = {}) {
     if (!shouldLoad) return;
     mountedRef.current = true;
     modelIdRef.current = modelId;
-    const initTTS = async () => {
-      try {
-        const isSupertonic = modelId === "supertonic-66m";
-        const config = TTS_MODELS[modelId];
-        setLoadingProgress({
-          status: "loading",
-          message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
-        });
-        if (isSupertonic) {
-          const { pipeline, env } = await import("../transformers.web-u34VxRFM.js");
-          if (env.backends?.onnx?.wasm) env.backends.onnx.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/";
-          const tts = await pipeline("text-to-speech", config.repo, {
-            device: "webgpu",
-            progress_callback: (progress) => {
-              if (!mountedRef.current) return;
-              if (progress.status === "progress" && progress.file) setLoadingProgress({
-                status: "downloading",
-                file: progress.file,
-                progress: Math.round(progress.progress || 0)
-              });
-            }
-          });
-          if (!mountedRef.current) return;
-          const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
-          const embeddingsMap = /* @__PURE__ */ new Map();
-          await Promise.all(config.voices.map(async (voice) => {
-            try {
-              const response = await fetch(`${voicesUrl}${voice.id}.bin`);
-              if (response.ok) {
-                const buffer = await response.arrayBuffer();
-                embeddingsMap.set(voice.id, new Float32Array(buffer));
-              }
-            } catch (e) {
-              console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
-            }
-          }));
-          if (!mountedRef.current) return;
-          try {
-            await tts("Hello", {
-              speaker_embeddings: new Float32Array(12928),
-              num_inference_steps: 1,
-              speed: 1
-            });
-          } catch (e) {
-            console.warn("Supertonic warmup failed:", e);
-          }
-          voiceEmbeddingsRef.current = embeddingsMap;
-          ttsRef.current = {
-            type: "supertonic",
-            pipeline: tts,
-            config
-          };
-        } else {
-          const { KokoroTTS } = await import("../kokoro-CMOGDSgT.js");
-          const tts = await KokoroTTS.from_pretrained(config.repo, {
-            dtype: "fp32",
-            progress_callback: (progress) => {
-              if (!mountedRef.current) return;
-              if (progress.status === "progress" && progress.file) setLoadingProgress({
-                status: "downloading",
-                file: progress.file,
-                progress: Math.round(progress.progress || 0)
-              });
-            }
-          });
-          if (!mountedRef.current) return;
-          ttsRef.current = {
-            type: "kokoro",
-            instance: tts,
-            config
-          };
-        }
+    const config = TTS_MODELS[modelId];
+    setLoadingProgress({
+      status: "loading",
+      message: `Loading ${modelId === "supertonic-66m" ? "Supertonic" : "Kokoro"} TTS...`
+    });
+    const worker = createTTSWorker();
+    workerRef.current = worker;
+    worker.onmessage = (e) => {
+      if (!mountedRef.current) return;
+      const { type, payload } = e.data;
+      if (type === "progress" && payload.status === "progress" && payload.file) setLoadingProgress({
+        status: "downloading",
+        file: payload.file,
+        progress: Math.round(payload.progress || 0)
+      });
+      if (type === "ready") {
         setIsLoading(false);
         setIsReady(true);
         setLoadingProgress({ status: "ready" });
         onReady?.();
-      } catch (err) {
-        if (!mountedRef.current) return;
-        const errorMsg = err instanceof Error ? err.message : String(err);
+        if (pendingSpeakRef.current) {
+          const { text, voice, speed } = pendingSpeakRef.current;
+          pendingSpeakRef.current = null;
+          worker.postMessage({
+            type: "generate",
+            payload: {
+              text,
+              voice,
+              speed
+            }
+          });
+        }
+      }
+      if (type === "audio") {
+        const { audio, sampleRate } = payload;
+        playAudioData(audio, sampleRate);
+      }
+      if (type === "error") {
+        const errorMsg = payload;
         setError(errorMsg);
         setIsLoading(false);
+        setIsSpeaking(false);
         setLoadingProgress({
           status: "error",
           error: errorMsg
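
Note: the hook now drives the worker through a four-message protocol - `load` in, `progress`/`ready` out, then `generate` in, `audio`/`error` out. Outside React the same round trip looks like this (a sketch; the voice id and repo are placeholders, not values from the package):

const worker = createTTSWorker();
worker.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "progress") console.log(payload.file, payload.progress);
  if (type === "ready") worker.postMessage({ type: "generate", payload: { text: "Hi", voice: "voice-id", speed: 1 } });
  if (type === "audio") console.log(payload.audio.length, "samples @", payload.sampleRate, "Hz");
  if (type === "error") console.error(payload);
};
worker.postMessage({ type: "load", payload: { modelId: "supertonic-66m", repo: "hf-org/hf-repo", voices: [] } });
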
@@ -1445,9 +1526,29 @@ function useSpeech(options = {}) {
         onError?.(errorMsg);
       }
     };
-    initTTS();
+    worker.onerror = (err) => {
+      if (!mountedRef.current) return;
+      const errorMsg = err.message || "Worker error";
+      setError(errorMsg);
+      setIsLoading(false);
+      setLoadingProgress({
+        status: "error",
+        error: errorMsg
+      });
+      onError?.(errorMsg);
+    };
+    worker.postMessage({
+      type: "load",
+      payload: {
+        modelId,
+        repo: config.repo,
+        voices: config.voices
+      }
+    });
     return () => {
       mountedRef.current = false;
+      worker.terminate();
+      workerRef.current = null;
     };
   }, [
     shouldLoad,
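
Note: the effect now owns the worker's entire lifecycle: create, wire handlers, post `load`, and terminate unconditionally in the cleanup, so unmounting mid-download kills the thread instead of racing mountedRef checks. Condensed shape of the effect (handler bodies elided, dependency list abbreviated):

useEffect(() => {
  if (!shouldLoad) return;
  const worker = createTTSWorker();
  workerRef.current = worker;
  worker.onmessage = handleMessage; // progress / ready / audio / error, as in the diff
  worker.onerror = handleError;
  worker.postMessage({ type: "load", payload: { modelId, repo, voices } });
  return () => {
    mountedRef.current = false;
    worker.terminate(); // also aborts an in-flight model download
    workerRef.current = null;
  };
}, [shouldLoad, modelId]);
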
@@ -1455,6 +1556,30 @@ function useSpeech(options = {}) {
     onReady,
     onError
   ]);
+  const playAudioData = useCallback(async (audio, sampleRate) => {
+    try {
+      if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext({ sampleRate });
+      const ctx = audioContextRef.current;
+      if (ctx.state === "suspended") await ctx.resume();
+      const audioBuffer = ctx.createBuffer(1, audio.length, sampleRate);
+      audioBuffer.copyToChannel(new Float32Array(audio), 0);
+      const sourceNode = ctx.createBufferSource();
+      sourceNode.buffer = audioBuffer;
+      sourceNode.connect(ctx.destination);
+      sourceNodeRef.current = sourceNode;
+      sourceNode.onended = () => {
+        if (!mountedRef.current) return;
+        setIsSpeaking(false);
+        onEnd?.();
+      };
+      sourceNode.start();
+    } catch (err) {
+      setIsSpeaking(false);
+      const errorMsg = err instanceof Error ? err.message : String(err);
+      setError(errorMsg);
+      onError?.(errorMsg);
+    }
+  }, [onEnd, onError]);
   useEffect(() => {
     return () => {
       try {
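
Note: playAudioData is the playback half that previously lived inline in speak(); it turns the worker's raw Float32Array into a one-shot AudioBufferSourceNode. The same steps as a standalone function (a sketch, assuming mono audio as both workers produce):

async function playFloat32(samples, sampleRate) {
  const ctx = new AudioContext({ sampleRate });
  if (ctx.state === "suspended") await ctx.resume(); // autoplay policies may suspend new contexts
  const buffer = ctx.createBuffer(1, samples.length, sampleRate); // 1 channel = mono
  buffer.copyToChannel(samples, 0);
  const node = ctx.createBufferSource();
  node.buffer = buffer;
  node.connect(ctx.destination);
  node.start();
  return new Promise((resolve) => { node.onended = resolve; }); // resolves when playback ends
}
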
@@ -1469,89 +1594,46 @@ function useSpeech(options = {}) {
     speak: useCallback(async (text, opts) => {
       const voice = opts?.voice || currentVoice;
       const speed = opts?.speed || currentSpeed;
-      if (!ttsRef.current) {
+      if (!modelConfig.voices.find((v) => v.id === voice)) {
+        const errorMsg = `Voice "${voice}" not found. Should be one of: ${modelConfig.voices.map((v) => v.id).join(", ")}.`;
+        setError(errorMsg);
+        onError?.(errorMsg);
+        return;
+      }
+      if (!workerRef.current) {
+        pendingSpeakRef.current = {
+          text,
+          voice,
+          speed
+        };
         load();
         return;
       }
-      try {
-        setIsSpeaking(true);
-        onStart?.();
-        let audioData;
-        let sampleRate;
-        const ttsBackend = ttsRef.current;
-        if (ttsBackend.type === "supertonic") {
-          const config = ttsBackend.config;
-          if (!config.voices.find((v) => v.id === voice)) {
-            const validVoices = config.voices.map((v) => v.id).join(", ");
-            throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-          }
-          let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
-          if (!speakerEmbedding) try {
-            const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
-            const response = await fetch(voiceUrl);
-            if (response.ok) {
-              const buffer = await response.arrayBuffer();
-              speakerEmbedding = new Float32Array(buffer);
-              voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-            } else throw new Error(`Failed to load voice: ${response.status}`);
-          } catch {
-            speakerEmbedding = new Float32Array(12928).fill(.1);
-            voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-          }
-          const result = await ttsBackend.pipeline(text, {
-            speaker_embeddings: speakerEmbedding,
-            speed
-          });
-          audioData = result.audio;
-          sampleRate = result.sampling_rate;
-        } else {
-          const config = ttsBackend.config;
-          if (!config.voices.find((v) => v.id === voice)) {
-            const validVoices = config.voices.map((v) => v.id).join(", ");
-            throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-          }
-          const result = await ttsBackend.instance.generate(text, {
-            voice,
-            speed
-          });
-          audioData = result.audio;
-          sampleRate = result.sampling_rate;
-        }
-        if (!mountedRef.current) return;
-        if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
-        const audioContext = audioContextRef.current;
-        if (audioContext.state === "suspended") await audioContext.resume();
-        const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
-        const channelData = new Float32Array(audioData);
-        audioBuffer.copyToChannel(channelData, 0);
-        if (sourceNodeRef.current) {
-          sourceNodeRef.current.stop();
-          sourceNodeRef.current.disconnect();
-        }
-        const sourceNode = audioContext.createBufferSource();
-        sourceNode.buffer = audioBuffer;
-        sourceNode.connect(audioContext.destination);
-        sourceNode.onended = () => {
-          if (mountedRef.current) {
-            setIsSpeaking(false);
-            onEnd?.();
-          }
+      if (!isReady) {
+        pendingSpeakRef.current = {
+          text,
+          voice,
+          speed
         };
-        sourceNodeRef.current = sourceNode;
-        sourceNode.start();
-      } catch (err) {
-        if (!mountedRef.current) return;
-        const errorMsg = err instanceof Error ? err.message : String(err);
-        setError(errorMsg);
-        setIsSpeaking(false);
-        onError?.(errorMsg);
+        return;
       }
+      setIsSpeaking(true);
+      onStart?.();
+      workerRef.current.postMessage({
+        type: "generate",
+        payload: {
+          text,
+          voice,
+          speed
+        }
+      });
     }, [
       currentVoice,
       currentSpeed,
+      modelConfig.voices,
       load,
+      isReady,
       onStart,
-      onEnd,
       onError
     ]),
     stop: useCallback(() => {
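
Note: speak() is now fire-and-forget - it validates the voice, queues the request in pendingSpeakRef if the worker is not ready, and otherwise posts `generate`; playback is triggered by the `audio` message. Hypothetical component usage (return names taken from this file, not from package docs):

function SpeakButton() {
  const { speak, stop, isSpeaking, isReady } = useSpeech({ modelId: "supertonic-66m" });
  return (
    <button onClick={() => (isSpeaking ? stop() : speak("Hello from a worker"))}>
      {isReady ? (isSpeaking ? "Stop" : "Speak") : "Load & speak"}
    </button>
  );
}
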
@@ -1674,6 +1756,61 @@ function createAudioPlayer(sampleRate = 24e3) {
     isPlaying: () => isActive
   };
 }
+const STT_WORKER_CODE = `
+// STT Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let sttPipeline = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { model } = payload;
+
+      // Load Whisper model
+      sttPipeline = await pipeline("automatic-speech-recognition", model, {
+        device: "webgpu",
+        progress_callback: (progress) => {
+          self.postMessage({ type: "progress", payload: progress });
+        },
+      });
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "transcribe") {
+    try {
+      const { audio } = payload;
+
+      // Run transcription
+      const result = await sttPipeline(audio, {
+        return_timestamps: false,
+      });
+
+      self.postMessage({ type: "transcript", payload: result.text || "" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create STT worker instance */
+function createSTTWorker() {
+  const blob = new Blob([STT_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for voice input with browser microphone
  *
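
Note: the STT worker mirrors the TTS protocol with `transcribe`/`transcript` in place of `generate`/`audio`. Callers transfer the Float32Array's buffer in postMessage, which detaches the caller's copy - hence the fresh `new Float32Array(audio)` the hooks below build before every send. Round trip in isolation (a sketch):

const stt = createSTTWorker();
stt.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "ready") {
    const audio = new Float32Array(16000); // one second of silence at 16 kHz
    stt.postMessage({ type: "transcribe", payload: { audio } }, [audio.buffer]);
    // audio.buffer is now detached here; the worker owns the memory
  }
  if (type === "transcript") console.log(payload);
};
stt.postMessage({ type: "load", payload: { model: "onnx-community/whisper-tiny.en" } });
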
@@ -1731,7 +1868,7 @@ function useVoiceInput(options = {}) {
   const [chunkCount, setChunkCount] = useState(0);
   const [error, setError] = useState(null);
   const [shouldLoad, setShouldLoad] = useState(autoLoad);
-  const sttRef = useRef(null);
+  const workerRef = useRef(null);
   const mediaRecorderRef = useRef(null);
   const audioChunksRef = useRef([]);
   const streamRef = useRef(null);
@@ -1739,49 +1876,66 @@ function useVoiceInput(options = {}) {
   const streamingIntervalRef = useRef(null);
   const pendingChunksRef = useRef([]);
   const fullTranscriptRef = useRef("");
+  const transcribeResolveRef = useRef(null);
+  const transcribeRejectRef = useRef(null);
+  const resolveSTTModel = (modelId) => {
+    return {
+      "whisper-tiny": "onnx-community/whisper-tiny",
+      "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+      "whisper-base": "onnx-community/whisper-base",
+      "whisper-base.en": "onnx-community/whisper-base.en",
+      "whisper-small": "onnx-community/whisper-small",
+      "whisper-small.en": "onnx-community/whisper-small.en"
+    }[modelId] || modelId;
+  };
   useEffect(() => {
     if (!shouldLoad || isReady) return;
-    let cancelled = false;
-    const loadModel = async () => {
-      try {
-        setIsLoading(true);
-        setLoadingProgress({
-          status: "loading",
-          message: "Loading STT model..."
-        });
-        onProgress?.({
-          status: "loading",
-          message: "Loading STT model..."
-        });
-        const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-        if (cancelled || !mountedRef.current) return;
-        const stt = new WhisperSTT(model);
-        await stt.load({ onProgress: (p) => {
-          if (!mountedRef.current) return;
-          const progress = {
-            status: p.progress !== void 0 ? "downloading" : "loading",
-            message: p.status,
-            progress: p.progress,
-            file: p.file
-          };
-          setLoadingProgress(progress);
-          onProgress?.(progress);
-        } });
-        if (cancelled || !mountedRef.current) {
-          stt.dispose();
-          return;
-        }
-        sttRef.current = stt;
+    mountedRef.current = true;
+    setIsLoading(true);
+    setLoadingProgress({
+      status: "loading",
+      message: "Loading STT model..."
+    });
+    onProgress?.({
+      status: "loading",
+      message: "Loading STT model..."
+    });
+    const worker = createSTTWorker();
+    workerRef.current = worker;
+    worker.onmessage = (e) => {
+      if (!mountedRef.current) return;
+      const { type, payload } = e.data;
+      if (type === "progress") {
+        const progress = {
+          status: payload.progress !== void 0 ? "downloading" : "loading",
+          message: payload.status,
+          progress: payload.progress,
+          file: payload.file
+        };
+        setLoadingProgress(progress);
+        onProgress?.(progress);
+      }
+      if (type === "ready") {
         setIsReady(true);
         setIsLoading(false);
         setLoadingProgress({ status: "ready" });
         onProgress?.({ status: "ready" });
         onReady?.();
-      } catch (e) {
-        if (!mountedRef.current) return;
-        const errMsg = e.message || "Failed to load STT model";
+      }
+      if (type === "transcript") {
+        const text = payload;
+        setIsTranscribing(false);
+        if (transcribeResolveRef.current) {
+          transcribeResolveRef.current(text);
+          transcribeResolveRef.current = null;
+          transcribeRejectRef.current = null;
+        }
+      }
+      if (type === "error") {
+        const errMsg = payload;
         setError(errMsg);
         setIsLoading(false);
+        setIsTranscribing(false);
         setLoadingProgress({
           status: "error",
           message: errMsg
@@ -1791,11 +1945,36 @@ function useVoiceInput(options = {}) {
           message: errMsg
         });
         onError?.(errMsg);
+        if (transcribeRejectRef.current) {
+          transcribeRejectRef.current(new Error(errMsg));
+          transcribeResolveRef.current = null;
+          transcribeRejectRef.current = null;
+        }
       }
     };
-    loadModel();
+    worker.onerror = (err) => {
+      if (!mountedRef.current) return;
+      const errMsg = err.message || "Worker error";
+      setError(errMsg);
+      setIsLoading(false);
+      setLoadingProgress({
+        status: "error",
+        message: errMsg
+      });
+      onProgress?.({
+        status: "error",
+        message: errMsg
+      });
+      onError?.(errMsg);
+    };
+    worker.postMessage({
+      type: "load",
+      payload: { model: resolveSTTModel(model) }
+    });
     return () => {
-      cancelled = true;
+      mountedRef.current = false;
+      worker.terminate();
+      workerRef.current = null;
    };
  }, [
    shouldLoad,
@@ -1809,7 +1988,10 @@ function useVoiceInput(options = {}) {
     mountedRef.current = true;
     return () => {
       mountedRef.current = false;
-      if (sttRef.current) sttRef.current.dispose();
+      if (workerRef.current) {
+        workerRef.current.terminate();
+        workerRef.current = null;
+      }
       if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
     };
   }, []);
@@ -1843,27 +2025,38 @@ function useVoiceInput(options = {}) {
     return new Float32Array(channelData);
   }, []);
   const transcribe = useCallback(async (audio) => {
-    if (!sttRef.current) {
+    if (!workerRef.current) {
       if (!shouldLoad) {
         setShouldLoad(true);
         throw new Error("STT model not loaded. Loading now, please try again.");
       }
       throw new Error("STT model not loaded");
     }
+    if (!isReady) throw new Error("STT model still loading");
     setIsTranscribing(true);
-    try {
-      let text = (await sttRef.current.transcribe(audio)).text.trim();
-      if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
-      setTranscript(text);
-      onTranscript?.(text);
-      return text;
-    } finally {
-      if (mountedRef.current) setIsTranscribing(false);
-    }
-  }, [shouldLoad, onTranscript]);
+    return new Promise((resolve, reject) => {
+      transcribeResolveRef.current = (text) => {
+        let filtered = text.trim();
+        if (filtered === "[BLANK_AUDIO]" || filtered === "(blank audio)" || filtered === "[BLANK AUDIO]") filtered = "";
+        setTranscript(filtered);
+        onTranscript?.(filtered);
+        resolve(filtered);
+      };
+      transcribeRejectRef.current = reject;
+      const audioArray = new Float32Array(audio);
+      workerRef.current.postMessage({
+        type: "transcribe",
+        payload: { audio: audioArray }
+      }, [audioArray.buffer]);
+    });
+  }, [
+    shouldLoad,
+    isReady,
+    onTranscript
+  ]);
   const processedSamplesRef = useRef(0);
   const transcribeChunk = useCallback(async (chunkIdx) => {
-    if (!sttRef.current || audioChunksRef.current.length === 0) return "";
+    if (!workerRef.current || !isReady || audioChunksRef.current.length === 0) return "";
     try {
       const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
       const newSamplesStart = processedSamplesRef.current;
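
Note: transcribe() bridges the worker's messages into a Promise by parking resolve/reject in refs that the load-effect's onmessage handler calls later. Because there is a single pair of refs, one transcription can be in flight per hook instance; a second call overwrites the first caller's resolver. The bridge, stripped of React (a sketch):

let pendingResolve = null;
let pendingReject = null;
worker.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "transcript") { pendingResolve?.(payload); pendingResolve = pendingReject = null; }
  if (type === "error") { pendingReject?.(new Error(payload)); pendingResolve = pendingReject = null; }
};
function transcribe(audio) {
  return new Promise((resolve, reject) => {
    pendingResolve = resolve;
    pendingReject = reject;
    const copy = new Float32Array(audio); // fresh copy, since the buffer is transferred
    worker.postMessage({ type: "transcribe", payload: { audio: copy } }, [copy.buffer]);
  });
}
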
@@ -1871,8 +2064,7 @@ function useVoiceInput(options = {}) {
       if (totalSamples - newSamplesStart < 8e3) return "";
       const newAudio = audioData.slice(newSamplesStart);
       processedSamplesRef.current = totalSamples;
-      let text = (await sttRef.current.transcribe(newAudio)).text.trim();
-      if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
+      const text = await transcribe(newAudio);
       if (text && mountedRef.current) {
         setStreamingChunk(text);
         onChunk?.(text, chunkIdx);
@@ -1881,38 +2073,30 @@ function useVoiceInput(options = {}) {
     } catch {
       return "";
     }
-  }, [blobToFloat32, onChunk]);
+  }, [
+    blobToFloat32,
+    isReady,
+    transcribe,
+    onChunk
+  ]);
   return {
     startRecording: useCallback(async () => {
       if (isRecording) return;
       try {
-        if (streaming && !sttRef.current) {
+        if (streaming && !isReady) {
           if (!shouldLoad) setShouldLoad(true);
-          setIsLoading(true);
-          const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-          const stt = new WhisperSTT(model);
-          await stt.load({ onProgress: (p) => {
-            if (mountedRef.current) {
-              const progress = {
-                status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
-                message: p.status,
-                progress: p.progress,
-                file: p.file
-              };
-              setLoadingProgress(progress);
-              onProgress?.(progress);
-            }
-          } });
-          if (!mountedRef.current) {
-            stt.dispose();
-            return;
-          }
-          sttRef.current = stt;
-          setIsReady(true);
-          setIsLoading(false);
-          setLoadingProgress({ status: "ready" });
-          onProgress?.({ status: "ready" });
-          onReady?.();
+          await new Promise((resolve, reject) => {
+            const checkReady = setInterval(() => {
+              if (isReady && workerRef.current) {
+                clearInterval(checkReady);
+                resolve();
+              }
+            }, 100);
+            setTimeout(() => {
+              clearInterval(checkReady);
+              reject(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
+            }, 6e4);
+          });
         }
         const stream = await navigator.mediaDevices.getUserMedia({ audio: {
           sampleRate: 16e3,
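
Note: startRecording no longer loads the model inline; it polls until the effect-owned worker reports ready, with a 60 s timeout. One caveat: `isReady` inside the interval is the value captured when the callback was created, so a poll started before the model is ready appears to depend on the callback being recreated rather than observing the state change. A ref mirror would sidestep that (hypothetical variant, not in this release):

const isReadyRef = useRef(false);
useEffect(() => { isReadyRef.current = isReady; }, [isReady]);
// inside startRecording:
await new Promise((resolve, reject) => {
  const timer = setInterval(() => {
    if (isReadyRef.current && workerRef.current) { clearInterval(timer); resolve(); }
  }, 100);
  setTimeout(() => { clearInterval(timer); reject(new Error("Timeout waiting for STT model")); }, 60000);
});
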
@@ -1939,7 +2123,7 @@ function useVoiceInput(options = {}) {
         mediaRecorder.start(100);
         setIsRecording(true);
         setError(null);
-        if (streaming && sttRef.current) {
+        if (streaming && isReady && workerRef.current) {
           let chunkIdx = 0;
           let shouldContinue = true;
           const processNextChunk = async () => {
@@ -2029,11 +2213,11 @@ function useVoiceInput(options = {}) {
       }
       const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
       try {
-        if (!sttRef.current) {
+        if (!isReady || !workerRef.current) {
           if (!shouldLoad) setShouldLoad(true);
           await new Promise((res, rej) => {
             const checkReady = setInterval(() => {
-              if (sttRef.current) {
+              if (isReady && workerRef.current) {
                 clearInterval(checkReady);
                 res();
               }
@@ -2164,6 +2348,16 @@ function useVoiceChat(options = {}) {
   const isListening = stage === "listening";
   const isProcessing = stage === "transcribing" || stage === "thinking";
   const isSpeaking = stage === "speaking";
+  const resolveSTTModel = (modelId) => {
+    return {
+      "whisper-tiny": "onnx-community/whisper-tiny",
+      "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+      "whisper-base": "onnx-community/whisper-base",
+      "whisper-base.en": "onnx-community/whisper-base.en",
+      "whisper-small": "onnx-community/whisper-small",
+      "whisper-small.en": "onnx-community/whisper-small.en"
+    }[modelId] || modelId;
+  };
   useEffect(() => {
     if (!shouldLoad || isReady) return;
     let cancelled = false;
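
Note: this alias table is a verbatim copy of the one added to useVoiceInput above. It expands the short Whisper names to their onnx-community repos and passes anything else through, so fully qualified model ids keep working:

resolveSTTModel("whisper-base.en");    // "onnx-community/whisper-base.en"
resolveSTTModel("org/custom-whisper"); // unrecognized ids pass through unchanged
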
@@ -2172,18 +2366,29 @@ function useVoiceChat(options = {}) {
       setIsLoading(true);
       setError(null);
       setLoadingMessage("Loading speech recognition (Whisper)...");
-      const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-      if (cancelled || !mountedRef.current) return;
-      const stt = new WhisperSTT(sttModel);
-      await stt.load({ onProgress: (p) => {
-        if (!mountedRef.current) return;
-        setLoadingMessage(p.status || "Loading STT...");
-      } });
+      const sttWorker = createSTTWorker();
+      if (cancelled || !mountedRef.current) {
+        sttWorker.terminate();
+        return;
+      }
+      await new Promise((resolve, reject) => {
+        sttWorker.onmessage = (e) => {
+          const { type, payload } = e.data;
+          if (type === "ready") resolve();
+          if (type === "error") reject(new Error(payload));
+          if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading STT...");
+        };
+        sttWorker.onerror = (e) => reject(new Error(e.message));
+        sttWorker.postMessage({
+          type: "load",
+          payload: { model: resolveSTTModel(sttModel) }
+        });
+      });
       if (cancelled || !mountedRef.current) {
-        stt.dispose();
+        sttWorker.terminate();
         return;
       }
-      sttRef.current = stt;
+      sttRef.current = sttWorker;
       setLoadingMessage("Loading language model...");
       const worker = await createGerbilWorker({
         modelId: llmModel,
@@ -2198,18 +2403,34 @@ function useVoiceChat(options = {}) {
       }
       llmWorkerRef.current = worker;
       setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
-      const { createTTS } = await import("../tts-C2FzKuSx.js");
-      if (cancelled || !mountedRef.current) return;
-      const tts = createTTS(ttsModelId);
-      await tts.load({ onProgress: (p) => {
-        if (!mountedRef.current) return;
-        setLoadingMessage(p.status || "Loading TTS...");
-      } });
+      const ttsWorker = createTTSWorker();
+      if (cancelled || !mountedRef.current) {
+        ttsWorker.terminate();
+        return;
+      }
+      const ttsConfig$1 = TTS_MODELS[ttsModelId];
+      await new Promise((resolve, reject) => {
+        ttsWorker.onmessage = (e) => {
+          const { type, payload } = e.data;
+          if (type === "ready") resolve();
+          if (type === "error") reject(new Error(payload));
+          if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading TTS...");
+        };
+        ttsWorker.onerror = (e) => reject(new Error(e.message));
+        ttsWorker.postMessage({
+          type: "load",
+          payload: {
+            modelId: ttsModelId,
+            repo: ttsConfig$1.repo,
+            voices: ttsConfig$1.voices
+          }
+        });
+      });
       if (cancelled || !mountedRef.current) {
-        await tts.dispose();
+        ttsWorker.terminate();
         return;
       }
-      ttsRef.current = tts;
+      ttsRef.current = ttsWorker;
       setIsReady(true);
       setIsLoading(false);
       setLoadingMessage("Ready!");
@@ -2238,8 +2459,8 @@ function useVoiceChat(options = {}) {
     return () => {
       mountedRef.current = false;
       llmWorkerRef.current?.terminate();
-      sttRef.current?.dispose();
-      ttsRef.current?.dispose();
+      sttRef.current?.terminate();
+      ttsRef.current?.terminate();
       if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
       audioContextRef.current?.close();
     };
@@ -2345,7 +2566,26 @@ function useVoiceChat(options = {}) {
     try {
       setStage("transcribing");
       const audioData = await blobToFloat32(audioBlob);
-      let userText = (await sttRef.current.transcribe(audioData)).text.trim();
+      let userText = await new Promise((sttResolve, sttReject) => {
+        const handler = (e) => {
+          const { type, payload } = e.data;
+          if (type === "transcript") {
+            sttRef.current?.removeEventListener("message", handler);
+            sttResolve(payload);
+          }
+          if (type === "error") {
+            sttRef.current?.removeEventListener("message", handler);
+            sttReject(new Error(payload));
+          }
+        };
+        sttRef.current?.addEventListener("message", handler);
+        const audioArray = new Float32Array(audioData);
+        sttRef.current?.postMessage({
+          type: "transcribe",
+          payload: { audio: audioArray }
+        }, [audioArray.buffer]);
+      });
+      userText = userText.trim();
       if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
       if (cancelledRef.current || !userText) {
         setStage("idle");
@@ -2395,9 +2635,30 @@ function useVoiceChat(options = {}) {
       onAssistantSpeak?.(responseText);
       if (responseText.trim()) {
         setStage("speaking");
-        const ttsResult = await ttsRef.current.speak(responseText, {
-          voice,
-          speed
+        const ttsResult = await new Promise((ttsResolve, ttsReject) => {
+          const handler = (e) => {
+            const { type, payload } = e.data;
+            if (type === "audio") {
+              ttsRef.current?.removeEventListener("message", handler);
+              ttsResolve({
+                audio: payload.audio,
+                sampleRate: payload.sampleRate
+              });
+            }
+            if (type === "error") {
+              ttsRef.current?.removeEventListener("message", handler);
+              ttsReject(new Error(payload));
+            }
+          };
+          ttsRef.current?.addEventListener("message", handler);
+          ttsRef.current?.postMessage({
+            type: "generate",
+            payload: {
+              text: responseText,
+              voice,
+              speed
+            }
+          });
         });
         if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
       }
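
Note: useVoiceChat talks to the raw workers with one-shot addEventListener handlers that unsubscribe on the first terminal message, as above for both STT and TTS. The repeated pattern generalizes to a small helper (hypothetical, not part of the package):

function workerRequest(worker, message, transfer = [], doneTypes = ["transcript", "audio"]) {
  return new Promise((resolve, reject) => {
    const handler = (e) => {
      const { type, payload } = e.data;
      if (type === "error") {
        worker.removeEventListener("message", handler);
        reject(new Error(payload));
      } else if (doneTypes.includes(type)) {
        worker.removeEventListener("message", handler);
        resolve(payload);
      }
    };
    worker.addEventListener("message", handler);
    worker.postMessage(message, transfer);
  });
}
// e.g.: const text = await workerRequest(sttWorker, { type: "transcribe", payload: { audio } }, [audio.buffer]);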