@tryhamster/gerbil 1.0.0-rc.10 → 1.0.0-rc.11

This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
@@ -1290,6 +1290,126 @@ const TTS_MODELS = {
     voices: SUPERTONIC_BROWSER_VOICES
   }
 };
+const TTS_WORKER_CODE = `
+// TTS Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let ttsInstance = null;
+let modelType = null; // "supertonic" or "kokoro"
+let voiceEmbeddings = new Map();
+let kokoroTTS = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { modelId, repo, voices } = payload;
+      modelType = modelId === "supertonic-66m" ? "supertonic" : "kokoro";
+
+      if (modelType === "supertonic") {
+        // Load Supertonic using transformers.js pipeline
+        ttsInstance = await pipeline("text-to-speech", repo, {
+          device: "webgpu",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+
+        // Load voice embeddings
+        for (const voice of voices) {
+          try {
+            const voiceUrl = "https://huggingface.co/" + repo + "/resolve/main/voices/" + voice.id + ".bin";
+            const response = await fetch(voiceUrl);
+            if (response.ok) {
+              const buffer = await response.arrayBuffer();
+              voiceEmbeddings.set(voice.id, new Float32Array(buffer));
+            }
+          } catch (err) {
+            console.warn("Failed to load voice:", voice.id, err);
+          }
+        }
+
+        // Warmup
+        try {
+          await ttsInstance("Hello", {
+            speaker_embeddings: new Float32Array(1 * 101 * 128),
+            num_inference_steps: 1,
+            speed: 1.0,
+          });
+        } catch (e) {
+          console.warn("Warmup failed:", e);
+        }
+      } else {
+        // Load Kokoro using kokoro-js from CDN
+        const kokoroModule = await import("https://cdn.jsdelivr.net/npm/kokoro-js@1.2.1/dist/kokoro.web.min.js");
+        const { KokoroTTS } = kokoroModule;
+
+        kokoroTTS = await KokoroTTS.from_pretrained(repo, {
+          dtype: "fp32",
+          progress_callback: (progress) => {
+            self.postMessage({ type: "progress", payload: progress });
+          },
+        });
+      }
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "generate") {
+    try {
+      const { text, voice, speed } = payload;
+      let audio, sampleRate;
+
+      if (modelType === "supertonic") {
+        let embedding = voiceEmbeddings.get(voice);
+        if (!embedding) {
+          embedding = new Float32Array(101 * 128).fill(0.1);
+        }
+
+        const result = await ttsInstance(text, {
+          speaker_embeddings: embedding,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      } else {
+        const result = await kokoroTTS.generate(text, {
+          voice: voice,
+          speed: speed || 1.0,
+        });
+
+        audio = result.audio;
+        sampleRate = result.sampling_rate;
+      }
+
+      // Transfer audio data back
+      self.postMessage(
+        { type: "audio", payload: { audio: audio, sampleRate: sampleRate } },
+        [audio.buffer]
+      );
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create TTS worker instance */
+function createTTSWorker() {
+  const blob = new Blob([TTS_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for text-to-speech with Web Audio API playback
  *
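
Note: rc.11 moves both TTS backends off the main thread. createTTSWorker (above) spins the worker up from an inline Blob URL, so no separate worker file ships with the package and the CDN import happens inside the worker. The same pattern in isolation (a sketch; workerSource is a stand-in for TTS_WORKER_CODE):

// Spawn a module worker from inline source - no extra file needed.
const workerSource = `self.onmessage = (e) => self.postMessage(e.data * 2);`;
const blob = new Blob([workerSource], { type: "application/javascript" });
const url = URL.createObjectURL(blob);
const worker = new Worker(url, { type: "module" });
URL.revokeObjectURL(url); // revoked immediately, as createTTSWorker does
worker.onmessage = (e) => console.log(e.data); // logs 4
worker.postMessage(2);
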
@@ -1339,17 +1459,17 @@ function useSpeech(options = {}) {
   const [shouldLoad, setShouldLoad] = useState(autoLoad);
   const [currentVoice, setCurrentVoice] = useState(defaultVoice);
   const [currentSpeed, setCurrentSpeed] = useState(defaultSpeed);
-  const ttsRef = useRef(null);
-  const voiceEmbeddingsRef = useRef(/* @__PURE__ */ new Map());
+  const workerRef = useRef(null);
   const audioContextRef = useRef(null);
   const sourceNodeRef = useRef(null);
   const mountedRef = useRef(true);
   const modelIdRef = useRef(modelId);
+  const pendingSpeakRef = useRef(null);
   const listVoices = useCallback(() => {
     return modelConfig.voices;
   }, [modelConfig.voices]);
   const load = useCallback(() => {
-    if (ttsRef.current || isLoading) return;
+    if (workerRef.current || isLoading) return;
     setIsLoading(true);
     setShouldLoad(true);
   }, [isLoading]);
@@ -1357,87 +1477,48 @@ function useSpeech(options = {}) {
     if (!shouldLoad) return;
     mountedRef.current = true;
     modelIdRef.current = modelId;
-    const initTTS = async () => {
-      try {
-        const isSupertonic = modelId === "supertonic-66m";
-        const config = TTS_MODELS[modelId];
-        setLoadingProgress({
-          status: "loading",
-          message: `Loading ${isSupertonic ? "Supertonic" : "Kokoro"} TTS...`
-        });
-        if (isSupertonic) {
-          const { pipeline, env } = await import("../transformers.web-u34VxRFM.js");
-          if (env.backends?.onnx?.wasm) env.backends.onnx.wasm.wasmPaths = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.21.0/dist/";
-          const tts = await pipeline("text-to-speech", config.repo, {
-            device: "webgpu",
-            progress_callback: (progress) => {
-              if (!mountedRef.current) return;
-              if (progress.status === "progress" && progress.file) setLoadingProgress({
-                status: "downloading",
-                file: progress.file,
-                progress: Math.round(progress.progress || 0)
-              });
-            }
-          });
-          if (!mountedRef.current) return;
-          const voicesUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/`;
-          const embeddingsMap = /* @__PURE__ */ new Map();
-          await Promise.all(config.voices.map(async (voice) => {
-            try {
-              const response = await fetch(`${voicesUrl}${voice.id}.bin`);
-              if (response.ok) {
-                const buffer = await response.arrayBuffer();
-                embeddingsMap.set(voice.id, new Float32Array(buffer));
-              }
-            } catch (e) {
-              console.warn(`Failed to load voice embedding for ${voice.id}:`, e);
-            }
-          }));
-          if (!mountedRef.current) return;
-          try {
-            await tts("Hello", {
-              speaker_embeddings: new Float32Array(12928),
-              num_inference_steps: 1,
-              speed: 1
-            });
-          } catch (e) {
-            console.warn("Supertonic warmup failed:", e);
-          }
-          voiceEmbeddingsRef.current = embeddingsMap;
-          ttsRef.current = {
-            type: "supertonic",
-            pipeline: tts,
-            config
-          };
-        } else {
-          const { KokoroTTS } = await import("../kokoro-CMOGDSgT.js");
-          const tts = await KokoroTTS.from_pretrained(config.repo, {
-            dtype: "fp32",
-            progress_callback: (progress) => {
-              if (!mountedRef.current) return;
-              if (progress.status === "progress" && progress.file) setLoadingProgress({
-                status: "downloading",
-                file: progress.file,
-                progress: Math.round(progress.progress || 0)
-              });
-            }
-          });
-          if (!mountedRef.current) return;
-          ttsRef.current = {
-            type: "kokoro",
-            instance: tts,
-            config
-          };
-        }
+    const config = TTS_MODELS[modelId];
+    setLoadingProgress({
+      status: "loading",
+      message: `Loading ${modelId === "supertonic-66m" ? "Supertonic" : "Kokoro"} TTS...`
+    });
+    const worker = createTTSWorker();
+    workerRef.current = worker;
+    worker.onmessage = (e) => {
+      if (!mountedRef.current) return;
+      const { type, payload } = e.data;
+      if (type === "progress" && payload.status === "progress" && payload.file) setLoadingProgress({
+        status: "downloading",
+        file: payload.file,
+        progress: Math.round(payload.progress || 0)
+      });
+      if (type === "ready") {
         setIsLoading(false);
         setIsReady(true);
         setLoadingProgress({ status: "ready" });
         onReady?.();
-      } catch (err) {
-        if (!mountedRef.current) return;
-        const errorMsg = err instanceof Error ? err.message : String(err);
+        if (pendingSpeakRef.current) {
+          const { text, voice, speed } = pendingSpeakRef.current;
+          pendingSpeakRef.current = null;
+          worker.postMessage({
+            type: "generate",
+            payload: {
+              text,
+              voice,
+              speed
+            }
+          });
+        }
+      }
+      if (type === "audio") {
+        const { audio, sampleRate } = payload;
+        playAudioData(audio, sampleRate);
+      }
+      if (type === "error") {
+        const errorMsg = payload;
         setError(errorMsg);
         setIsLoading(false);
+        setIsSpeaking(false);
         setLoadingProgress({
           status: "error",
           error: errorMsg
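
Note: the hook now drives the worker through a four-message protocol - `load` in, `progress`/`ready` out, then `generate` in, `audio`/`error` out. Outside React the same round trip looks like this (a sketch; the voice id and repo are placeholders, not values from the package):

const worker = createTTSWorker();
worker.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "progress") console.log(payload.file, payload.progress);
  if (type === "ready") worker.postMessage({ type: "generate", payload: { text: "Hi", voice: "voice-id", speed: 1 } });
  if (type === "audio") console.log(payload.audio.length, "samples @", payload.sampleRate, "Hz");
  if (type === "error") console.error(payload);
};
worker.postMessage({ type: "load", payload: { modelId: "supertonic-66m", repo: "hf-org/hf-repo", voices: [] } });
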
@@ -1445,9 +1526,29 @@ function useSpeech(options = {}) {
         onError?.(errorMsg);
       }
     };
-    initTTS();
+    worker.onerror = (err) => {
+      if (!mountedRef.current) return;
+      const errorMsg = err.message || "Worker error";
+      setError(errorMsg);
+      setIsLoading(false);
+      setLoadingProgress({
+        status: "error",
+        error: errorMsg
+      });
+      onError?.(errorMsg);
+    };
+    worker.postMessage({
+      type: "load",
+      payload: {
+        modelId,
+        repo: config.repo,
+        voices: config.voices
+      }
+    });
     return () => {
       mountedRef.current = false;
+      worker.terminate();
+      workerRef.current = null;
     };
   }, [
     shouldLoad,
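
Note: the effect now owns the worker's entire lifecycle: create, wire handlers, post `load`, and terminate unconditionally in the cleanup, so unmounting mid-download kills the thread instead of racing mountedRef checks. Condensed shape of the effect (handler bodies elided, dependency list abbreviated):

useEffect(() => {
  if (!shouldLoad) return;
  const worker = createTTSWorker();
  workerRef.current = worker;
  worker.onmessage = handleMessage; // progress / ready / audio / error, as in the diff
  worker.onerror = handleError;
  worker.postMessage({ type: "load", payload: { modelId, repo, voices } });
  return () => {
    mountedRef.current = false;
    worker.terminate(); // also aborts an in-flight model download
    workerRef.current = null;
  };
}, [shouldLoad, modelId]);
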
@@ -1455,6 +1556,30 @@ function useSpeech(options = {}) {
     onReady,
     onError
   ]);
+  const playAudioData = useCallback(async (audio, sampleRate) => {
+    try {
+      if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext({ sampleRate });
+      const ctx = audioContextRef.current;
+      if (ctx.state === "suspended") await ctx.resume();
+      const audioBuffer = ctx.createBuffer(1, audio.length, sampleRate);
+      audioBuffer.copyToChannel(new Float32Array(audio), 0);
+      const sourceNode = ctx.createBufferSource();
+      sourceNode.buffer = audioBuffer;
+      sourceNode.connect(ctx.destination);
+      sourceNodeRef.current = sourceNode;
+      sourceNode.onended = () => {
+        if (!mountedRef.current) return;
+        setIsSpeaking(false);
+        onEnd?.();
+      };
+      sourceNode.start();
+    } catch (err) {
+      setIsSpeaking(false);
+      const errorMsg = err instanceof Error ? err.message : String(err);
+      setError(errorMsg);
+      onError?.(errorMsg);
+    }
+  }, [onEnd, onError]);
   useEffect(() => {
     return () => {
       try {
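
Note: playAudioData is the playback half that previously lived inline in speak(); it turns the worker's raw Float32Array into a one-shot AudioBufferSourceNode. The same steps as a standalone function (a sketch, assuming mono audio as both workers produce):

async function playFloat32(samples, sampleRate) {
  const ctx = new AudioContext({ sampleRate });
  if (ctx.state === "suspended") await ctx.resume(); // autoplay policies may suspend new contexts
  const buffer = ctx.createBuffer(1, samples.length, sampleRate); // 1 channel = mono
  buffer.copyToChannel(samples, 0);
  const node = ctx.createBufferSource();
  node.buffer = buffer;
  node.connect(ctx.destination);
  node.start();
  return new Promise((resolve) => { node.onended = resolve; }); // resolves when playback ends
}
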
@@ -1469,89 +1594,46 @@ function useSpeech(options = {}) {
     speak: useCallback(async (text, opts) => {
       const voice = opts?.voice || currentVoice;
       const speed = opts?.speed || currentSpeed;
-      if (!ttsRef.current) {
+      if (!modelConfig.voices.find((v) => v.id === voice)) {
+        const errorMsg = `Voice "${voice}" not found. Should be one of: ${modelConfig.voices.map((v) => v.id).join(", ")}.`;
+        setError(errorMsg);
+        onError?.(errorMsg);
+        return;
+      }
+      if (!workerRef.current) {
+        pendingSpeakRef.current = {
+          text,
+          voice,
+          speed
+        };
         load();
         return;
       }
-      try {
-        setIsSpeaking(true);
-        onStart?.();
-        let audioData;
-        let sampleRate;
-        const ttsBackend = ttsRef.current;
-        if (ttsBackend.type === "supertonic") {
-          const config = ttsBackend.config;
-          if (!config.voices.find((v) => v.id === voice)) {
-            const validVoices = config.voices.map((v) => v.id).join(", ");
-            throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-          }
-          let speakerEmbedding = voiceEmbeddingsRef.current.get(voice);
-          if (!speakerEmbedding) try {
-            const voiceUrl = `https://huggingface.co/${config.repo}/resolve/main/voices/${voice}.bin`;
-            const response = await fetch(voiceUrl);
-            if (response.ok) {
-              const buffer = await response.arrayBuffer();
-              speakerEmbedding = new Float32Array(buffer);
-              voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-            } else throw new Error(`Failed to load voice: ${response.status}`);
-          } catch {
-            speakerEmbedding = new Float32Array(12928).fill(.1);
-            voiceEmbeddingsRef.current.set(voice, speakerEmbedding);
-          }
-          const result = await ttsBackend.pipeline(text, {
-            speaker_embeddings: speakerEmbedding,
-            speed
-          });
-          audioData = result.audio;
-          sampleRate = result.sampling_rate;
-        } else {
-          const config = ttsBackend.config;
-          if (!config.voices.find((v) => v.id === voice)) {
-            const validVoices = config.voices.map((v) => v.id).join(", ");
-            throw new Error(`Voice "${voice}" not found. Should be one of: ${validVoices}.`);
-          }
-          const result = await ttsBackend.instance.generate(text, {
-            voice,
-            speed
-          });
-          audioData = result.audio;
-          sampleRate = result.sampling_rate;
-        }
-        if (!mountedRef.current) return;
-        if (!audioContextRef.current || audioContextRef.current.state === "closed") audioContextRef.current = new AudioContext();
-        const audioContext = audioContextRef.current;
-        if (audioContext.state === "suspended") await audioContext.resume();
-        const audioBuffer = audioContext.createBuffer(1, audioData.length, sampleRate);
-        const channelData = new Float32Array(audioData);
-        audioBuffer.copyToChannel(channelData, 0);
-        if (sourceNodeRef.current) {
-          sourceNodeRef.current.stop();
-          sourceNodeRef.current.disconnect();
-        }
-        const sourceNode = audioContext.createBufferSource();
-        sourceNode.buffer = audioBuffer;
-        sourceNode.connect(audioContext.destination);
-        sourceNode.onended = () => {
-          if (mountedRef.current) {
-            setIsSpeaking(false);
-            onEnd?.();
-          }
+      if (!isReady) {
+        pendingSpeakRef.current = {
+          text,
+          voice,
+          speed
         };
-        sourceNodeRef.current = sourceNode;
-        sourceNode.start();
-      } catch (err) {
-        if (!mountedRef.current) return;
-        const errorMsg = err instanceof Error ? err.message : String(err);
-        setError(errorMsg);
-        setIsSpeaking(false);
-        onError?.(errorMsg);
+        return;
       }
+      setIsSpeaking(true);
+      onStart?.();
+      workerRef.current.postMessage({
+        type: "generate",
+        payload: {
+          text,
+          voice,
+          speed
+        }
+      });
     }, [
       currentVoice,
       currentSpeed,
+      modelConfig.voices,
       load,
+      isReady,
       onStart,
-      onEnd,
       onError
     ]),
     stop: useCallback(() => {
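
Note: speak() is now fire-and-forget - it validates the voice, queues the request in pendingSpeakRef if the worker is not ready, and otherwise posts `generate`; playback is triggered by the `audio` message. Hypothetical component usage (return names taken from this file, not from package docs):

function SpeakButton() {
  const { speak, stop, isSpeaking, isReady } = useSpeech({ modelId: "supertonic-66m" });
  return (
    <button onClick={() => (isSpeaking ? stop() : speak("Hello from a worker"))}>
      {isReady ? (isSpeaking ? "Stop" : "Speak") : "Load & speak"}
    </button>
  );
}
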
@@ -1674,6 +1756,61 @@ function createAudioPlayer(sampleRate = 24e3) {
     isPlaying: () => isActive
   };
 }
+const STT_WORKER_CODE = `
+// STT Worker - runs in separate thread, loads from CDN
+import { pipeline, env } from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.8.1";
+
+// Configure environment
+env.useBrowserCache = true;
+env.allowLocalModels = false;
+
+let sttPipeline = null;
+
+self.onmessage = async (e) => {
+  const { type, payload } = e.data;
+
+  if (type === "load") {
+    try {
+      const { model } = payload;
+
+      // Load Whisper model
+      sttPipeline = await pipeline("automatic-speech-recognition", model, {
+        device: "webgpu",
+        progress_callback: (progress) => {
+          self.postMessage({ type: "progress", payload: progress });
+        },
+      });
+
+      self.postMessage({ type: "ready" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+
+  if (type === "transcribe") {
+    try {
+      const { audio } = payload;
+
+      // Run transcription
+      const result = await sttPipeline(audio, {
+        return_timestamps: false,
+      });
+
+      self.postMessage({ type: "transcript", payload: result.text || "" });
+    } catch (err) {
+      self.postMessage({ type: "error", payload: err.message || String(err) });
+    }
+  }
+};
+`;
+/** Create STT worker instance */
+function createSTTWorker() {
+  const blob = new Blob([STT_WORKER_CODE], { type: "application/javascript" });
+  const url = URL.createObjectURL(blob);
+  const worker = new Worker(url, { type: "module" });
+  URL.revokeObjectURL(url);
+  return worker;
+}
 /**
  * React hook for voice input with browser microphone
  *
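
Note: the STT worker mirrors the TTS protocol with `transcribe`/`transcript` in place of `generate`/`audio`. Callers transfer the Float32Array's buffer in postMessage, which detaches the caller's copy - hence the fresh `new Float32Array(audio)` the hooks below build before every send. Round trip in isolation (a sketch):

const stt = createSTTWorker();
stt.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "ready") {
    const audio = new Float32Array(16000); // one second of silence at 16 kHz
    stt.postMessage({ type: "transcribe", payload: { audio } }, [audio.buffer]);
    // audio.buffer is now detached here; the worker owns the memory
  }
  if (type === "transcript") console.log(payload);
};
stt.postMessage({ type: "load", payload: { model: "onnx-community/whisper-tiny.en" } });
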
@@ -1731,7 +1868,7 @@ function useVoiceInput(options = {}) {
   const [chunkCount, setChunkCount] = useState(0);
   const [error, setError] = useState(null);
   const [shouldLoad, setShouldLoad] = useState(autoLoad);
-  const sttRef = useRef(null);
+  const workerRef = useRef(null);
   const mediaRecorderRef = useRef(null);
   const audioChunksRef = useRef([]);
   const streamRef = useRef(null);
@@ -1739,49 +1876,66 @@ function useVoiceInput(options = {}) {
   const streamingIntervalRef = useRef(null);
   const pendingChunksRef = useRef([]);
   const fullTranscriptRef = useRef("");
+  const transcribeResolveRef = useRef(null);
+  const transcribeRejectRef = useRef(null);
+  const resolveSTTModel = (modelId) => {
+    return {
+      "whisper-tiny": "onnx-community/whisper-tiny",
+      "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+      "whisper-base": "onnx-community/whisper-base",
+      "whisper-base.en": "onnx-community/whisper-base.en",
+      "whisper-small": "onnx-community/whisper-small",
+      "whisper-small.en": "onnx-community/whisper-small.en"
+    }[modelId] || modelId;
+  };
   useEffect(() => {
     if (!shouldLoad || isReady) return;
-    let cancelled = false;
-    const loadModel = async () => {
-      try {
-        setIsLoading(true);
-        setLoadingProgress({
-          status: "loading",
-          message: "Loading STT model..."
-        });
-        onProgress?.({
-          status: "loading",
-          message: "Loading STT model..."
-        });
-        const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-        if (cancelled || !mountedRef.current) return;
-        const stt = new WhisperSTT(model);
-        await stt.load({ onProgress: (p) => {
-          if (!mountedRef.current) return;
-          const progress = {
-            status: p.progress !== void 0 ? "downloading" : "loading",
-            message: p.status,
-            progress: p.progress,
-            file: p.file
-          };
-          setLoadingProgress(progress);
-          onProgress?.(progress);
-        } });
-        if (cancelled || !mountedRef.current) {
-          stt.dispose();
-          return;
-        }
-        sttRef.current = stt;
+    mountedRef.current = true;
+    setIsLoading(true);
+    setLoadingProgress({
+      status: "loading",
+      message: "Loading STT model..."
+    });
+    onProgress?.({
+      status: "loading",
+      message: "Loading STT model..."
+    });
+    const worker = createSTTWorker();
+    workerRef.current = worker;
+    worker.onmessage = (e) => {
+      if (!mountedRef.current) return;
+      const { type, payload } = e.data;
+      if (type === "progress") {
+        const progress = {
+          status: payload.progress !== void 0 ? "downloading" : "loading",
+          message: payload.status,
+          progress: payload.progress,
+          file: payload.file
+        };
+        setLoadingProgress(progress);
+        onProgress?.(progress);
+      }
+      if (type === "ready") {
         setIsReady(true);
         setIsLoading(false);
         setLoadingProgress({ status: "ready" });
         onProgress?.({ status: "ready" });
         onReady?.();
-      } catch (e) {
-        if (!mountedRef.current) return;
-        const errMsg = e.message || "Failed to load STT model";
+      }
+      if (type === "transcript") {
+        const text = payload;
+        setIsTranscribing(false);
+        if (transcribeResolveRef.current) {
+          transcribeResolveRef.current(text);
+          transcribeResolveRef.current = null;
+          transcribeRejectRef.current = null;
+        }
+      }
+      if (type === "error") {
+        const errMsg = payload;
         setError(errMsg);
         setIsLoading(false);
+        setIsTranscribing(false);
         setLoadingProgress({
           status: "error",
           message: errMsg
@@ -1791,11 +1945,36 @@ function useVoiceInput(options = {}) {
           message: errMsg
         });
         onError?.(errMsg);
+        if (transcribeRejectRef.current) {
+          transcribeRejectRef.current(new Error(errMsg));
+          transcribeResolveRef.current = null;
+          transcribeRejectRef.current = null;
+        }
       }
     };
-    loadModel();
+    worker.onerror = (err) => {
+      if (!mountedRef.current) return;
+      const errMsg = err.message || "Worker error";
+      setError(errMsg);
+      setIsLoading(false);
+      setLoadingProgress({
+        status: "error",
+        message: errMsg
+      });
+      onProgress?.({
+        status: "error",
+        message: errMsg
+      });
+      onError?.(errMsg);
+    };
+    worker.postMessage({
+      type: "load",
+      payload: { model: resolveSTTModel(model) }
+    });
     return () => {
-      cancelled = true;
+      mountedRef.current = false;
+      worker.terminate();
+      workerRef.current = null;
    };
  }, [
    shouldLoad,
@@ -1809,7 +1988,10 @@ function useVoiceInput(options = {}) {
     mountedRef.current = true;
     return () => {
       mountedRef.current = false;
-      if (sttRef.current) sttRef.current.dispose();
+      if (workerRef.current) {
+        workerRef.current.terminate();
+        workerRef.current = null;
+      }
       if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
     };
   }, []);
@@ -1843,27 +2025,38 @@ function useVoiceInput(options = {}) {
     return new Float32Array(channelData);
   }, []);
   const transcribe = useCallback(async (audio) => {
-    if (!sttRef.current) {
+    if (!workerRef.current) {
       if (!shouldLoad) {
         setShouldLoad(true);
         throw new Error("STT model not loaded. Loading now, please try again.");
       }
       throw new Error("STT model not loaded");
     }
+    if (!isReady) throw new Error("STT model still loading");
     setIsTranscribing(true);
-    try {
-      let text = (await sttRef.current.transcribe(audio)).text.trim();
-      if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
-      setTranscript(text);
-      onTranscript?.(text);
-      return text;
-    } finally {
-      if (mountedRef.current) setIsTranscribing(false);
-    }
-  }, [shouldLoad, onTranscript]);
+    return new Promise((resolve, reject) => {
+      transcribeResolveRef.current = (text) => {
+        let filtered = text.trim();
+        if (filtered === "[BLANK_AUDIO]" || filtered === "(blank audio)" || filtered === "[BLANK AUDIO]") filtered = "";
+        setTranscript(filtered);
+        onTranscript?.(filtered);
+        resolve(filtered);
+      };
+      transcribeRejectRef.current = reject;
+      const audioArray = new Float32Array(audio);
+      workerRef.current.postMessage({
+        type: "transcribe",
+        payload: { audio: audioArray }
+      }, [audioArray.buffer]);
+    });
+  }, [
+    shouldLoad,
+    isReady,
+    onTranscript
+  ]);
   const processedSamplesRef = useRef(0);
   const transcribeChunk = useCallback(async (chunkIdx) => {
-    if (!sttRef.current || audioChunksRef.current.length === 0) return "";
+    if (!workerRef.current || !isReady || audioChunksRef.current.length === 0) return "";
     try {
       const audioData = await blobToFloat32(new Blob(audioChunksRef.current, { type: "audio/webm" }));
       const newSamplesStart = processedSamplesRef.current;
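
Note: transcribe() bridges the worker's messages into a Promise by parking resolve/reject in refs that the load-effect's onmessage handler calls later. Because there is a single pair of refs, one transcription can be in flight per hook instance; a second call overwrites the first caller's resolver. The bridge, stripped of React (a sketch):

let pendingResolve = null;
let pendingReject = null;
worker.onmessage = (e) => {
  const { type, payload } = e.data;
  if (type === "transcript") { pendingResolve?.(payload); pendingResolve = pendingReject = null; }
  if (type === "error") { pendingReject?.(new Error(payload)); pendingResolve = pendingReject = null; }
};
function transcribe(audio) {
  return new Promise((resolve, reject) => {
    pendingResolve = resolve;
    pendingReject = reject;
    const copy = new Float32Array(audio); // fresh copy, since the buffer is transferred
    worker.postMessage({ type: "transcribe", payload: { audio: copy } }, [copy.buffer]);
  });
}
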
@@ -1871,8 +2064,7 @@ function useVoiceInput(options = {}) {
       if (totalSamples - newSamplesStart < 8e3) return "";
       const newAudio = audioData.slice(newSamplesStart);
       processedSamplesRef.current = totalSamples;
-      let text = (await sttRef.current.transcribe(newAudio)).text.trim();
-      if (text === "[BLANK_AUDIO]" || text === "(blank audio)" || text === "[BLANK AUDIO]") text = "";
+      const text = await transcribe(newAudio);
       if (text && mountedRef.current) {
         setStreamingChunk(text);
         onChunk?.(text, chunkIdx);
@@ -1881,38 +2073,30 @@ function useVoiceInput(options = {}) {
     } catch {
       return "";
     }
-  }, [blobToFloat32, onChunk]);
+  }, [
+    blobToFloat32,
+    isReady,
+    transcribe,
+    onChunk
+  ]);
   return {
     startRecording: useCallback(async () => {
       if (isRecording) return;
       try {
-        if (streaming && !sttRef.current) {
+        if (streaming && !isReady) {
           if (!shouldLoad) setShouldLoad(true);
-          setIsLoading(true);
-          const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-          const stt = new WhisperSTT(model);
-          await stt.load({ onProgress: (p) => {
-            if (mountedRef.current) {
-              const progress = {
-                status: p.status === "downloading" ? "downloading" : p.status === "ready" ? "ready" : "loading",
-                message: p.status,
-                progress: p.progress,
-                file: p.file
-              };
-              setLoadingProgress(progress);
-              onProgress?.(progress);
-            }
-          } });
-          if (!mountedRef.current) {
-            stt.dispose();
-            return;
-          }
-          sttRef.current = stt;
-          setIsReady(true);
-          setIsLoading(false);
-          setLoadingProgress({ status: "ready" });
-          onProgress?.({ status: "ready" });
-          onReady?.();
+          await new Promise((resolve, reject) => {
+            const checkReady = setInterval(() => {
+              if (isReady && workerRef.current) {
+                clearInterval(checkReady);
+                resolve();
+              }
+            }, 100);
+            setTimeout(() => {
+              clearInterval(checkReady);
+              reject(/* @__PURE__ */ new Error("Timeout waiting for STT model"));
+            }, 6e4);
+          });
         }
         const stream = await navigator.mediaDevices.getUserMedia({ audio: {
           sampleRate: 16e3,
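
Note: startRecording no longer loads the model inline; it polls until the effect-owned worker reports ready, with a 60 s timeout. One caveat: `isReady` inside the interval is the value captured when the callback was created, so a poll started before the model is ready appears to depend on the callback being recreated rather than observing the state change. A ref mirror would sidestep that (hypothetical variant, not in this release):

const isReadyRef = useRef(false);
useEffect(() => { isReadyRef.current = isReady; }, [isReady]);
// inside startRecording:
await new Promise((resolve, reject) => {
  const timer = setInterval(() => {
    if (isReadyRef.current && workerRef.current) { clearInterval(timer); resolve(); }
  }, 100);
  setTimeout(() => { clearInterval(timer); reject(new Error("Timeout waiting for STT model")); }, 60000);
});
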
@@ -1939,7 +2123,7 @@ function useVoiceInput(options = {}) {
         mediaRecorder.start(100);
         setIsRecording(true);
         setError(null);
-        if (streaming && sttRef.current) {
+        if (streaming && isReady && workerRef.current) {
           let chunkIdx = 0;
           let shouldContinue = true;
           const processNextChunk = async () => {
@@ -2029,11 +2213,11 @@ function useVoiceInput(options = {}) {
       }
       const audioBlob = new Blob(audioChunksRef.current, { type: "audio/webm" });
       try {
-        if (!sttRef.current) {
+        if (!isReady || !workerRef.current) {
           if (!shouldLoad) setShouldLoad(true);
           await new Promise((res, rej) => {
             const checkReady = setInterval(() => {
-              if (sttRef.current) {
+              if (isReady && workerRef.current) {
                 clearInterval(checkReady);
                 res();
               }
@@ -2164,6 +2348,16 @@ function useVoiceChat(options = {}) {
   const isListening = stage === "listening";
   const isProcessing = stage === "transcribing" || stage === "thinking";
   const isSpeaking = stage === "speaking";
+  const resolveSTTModel = (modelId) => {
+    return {
+      "whisper-tiny": "onnx-community/whisper-tiny",
+      "whisper-tiny.en": "onnx-community/whisper-tiny.en",
+      "whisper-base": "onnx-community/whisper-base",
+      "whisper-base.en": "onnx-community/whisper-base.en",
+      "whisper-small": "onnx-community/whisper-small",
+      "whisper-small.en": "onnx-community/whisper-small.en"
+    }[modelId] || modelId;
+  };
   useEffect(() => {
     if (!shouldLoad || isReady) return;
     let cancelled = false;
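
Note: this alias table is a verbatim copy of the one added to useVoiceInput above. It expands the short Whisper names to their onnx-community repos and passes anything else through, so fully qualified model ids keep working:

resolveSTTModel("whisper-base.en");    // "onnx-community/whisper-base.en"
resolveSTTModel("org/custom-whisper"); // unrecognized ids pass through unchanged
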
@@ -2172,18 +2366,29 @@ function useVoiceChat(options = {}) {
       setIsLoading(true);
       setError(null);
       setLoadingMessage("Loading speech recognition (Whisper)...");
-      const { WhisperSTT } = await import("../stt-Dne6SENv.js");
-      if (cancelled || !mountedRef.current) return;
-      const stt = new WhisperSTT(sttModel);
-      await stt.load({ onProgress: (p) => {
-        if (!mountedRef.current) return;
-        setLoadingMessage(p.status || "Loading STT...");
-      } });
+      const sttWorker = createSTTWorker();
+      if (cancelled || !mountedRef.current) {
+        sttWorker.terminate();
+        return;
+      }
+      await new Promise((resolve, reject) => {
+        sttWorker.onmessage = (e) => {
+          const { type, payload } = e.data;
+          if (type === "ready") resolve();
+          if (type === "error") reject(new Error(payload));
+          if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading STT...");
+        };
+        sttWorker.onerror = (e) => reject(new Error(e.message));
+        sttWorker.postMessage({
+          type: "load",
+          payload: { model: resolveSTTModel(sttModel) }
+        });
+      });
       if (cancelled || !mountedRef.current) {
-        stt.dispose();
+        sttWorker.terminate();
         return;
       }
-      sttRef.current = stt;
+      sttRef.current = sttWorker;
       setLoadingMessage("Loading language model...");
       const worker = await createGerbilWorker({
         modelId: llmModel,
@@ -2198,18 +2403,34 @@ function useVoiceChat(options = {}) {
       }
       llmWorkerRef.current = worker;
       setLoadingMessage(`Loading text-to-speech (${ttsModelId === "supertonic-66m" ? "Supertonic" : "Kokoro"})...`);
-      const { createTTS } = await import("../tts-C2FzKuSx.js");
-      if (cancelled || !mountedRef.current) return;
-      const tts = createTTS(ttsModelId);
-      await tts.load({ onProgress: (p) => {
-        if (!mountedRef.current) return;
-        setLoadingMessage(p.status || "Loading TTS...");
-      } });
+      const ttsWorker = createTTSWorker();
+      if (cancelled || !mountedRef.current) {
+        ttsWorker.terminate();
+        return;
+      }
+      const ttsConfig$1 = TTS_MODELS[ttsModelId];
+      await new Promise((resolve, reject) => {
+        ttsWorker.onmessage = (e) => {
+          const { type, payload } = e.data;
+          if (type === "ready") resolve();
+          if (type === "error") reject(new Error(payload));
+          if (type === "progress" && mountedRef.current) setLoadingMessage(payload.status || "Loading TTS...");
+        };
+        ttsWorker.onerror = (e) => reject(new Error(e.message));
+        ttsWorker.postMessage({
+          type: "load",
+          payload: {
+            modelId: ttsModelId,
+            repo: ttsConfig$1.repo,
+            voices: ttsConfig$1.voices
+          }
+        });
+      });
       if (cancelled || !mountedRef.current) {
-        await tts.dispose();
+        ttsWorker.terminate();
         return;
       }
-      ttsRef.current = tts;
+      ttsRef.current = ttsWorker;
       setIsReady(true);
       setIsLoading(false);
       setLoadingMessage("Ready!");
@@ -2238,8 +2459,8 @@ function useVoiceChat(options = {}) {
     return () => {
       mountedRef.current = false;
       llmWorkerRef.current?.terminate();
-      sttRef.current?.dispose();
-      ttsRef.current?.dispose();
+      sttRef.current?.terminate();
+      ttsRef.current?.terminate();
       if (streamRef.current) for (const track of streamRef.current.getTracks()) track.stop();
       audioContextRef.current?.close();
     };
@@ -2345,7 +2566,26 @@ function useVoiceChat(options = {}) {
     try {
       setStage("transcribing");
       const audioData = await blobToFloat32(audioBlob);
-      let userText = (await sttRef.current.transcribe(audioData)).text.trim();
+      let userText = await new Promise((sttResolve, sttReject) => {
+        const handler = (e) => {
+          const { type, payload } = e.data;
+          if (type === "transcript") {
+            sttRef.current?.removeEventListener("message", handler);
+            sttResolve(payload);
+          }
+          if (type === "error") {
+            sttRef.current?.removeEventListener("message", handler);
+            sttReject(new Error(payload));
+          }
+        };
+        sttRef.current?.addEventListener("message", handler);
+        const audioArray = new Float32Array(audioData);
+        sttRef.current?.postMessage({
+          type: "transcribe",
+          payload: { audio: audioArray }
+        }, [audioArray.buffer]);
+      });
+      userText = userText.trim();
       if (userText === "[BLANK_AUDIO]" || userText === "(blank audio)" || userText === "[BLANK AUDIO]") userText = "";
       if (cancelledRef.current || !userText) {
         setStage("idle");
@@ -2395,9 +2635,30 @@ function useVoiceChat(options = {}) {
       onAssistantSpeak?.(responseText);
       if (responseText.trim()) {
         setStage("speaking");
-        const ttsResult = await ttsRef.current.speak(responseText, {
-          voice,
-          speed
+        const ttsResult = await new Promise((ttsResolve, ttsReject) => {
+          const handler = (e) => {
+            const { type, payload } = e.data;
+            if (type === "audio") {
+              ttsRef.current?.removeEventListener("message", handler);
+              ttsResolve({
+                audio: payload.audio,
+                sampleRate: payload.sampleRate
+              });
+            }
+            if (type === "error") {
+              ttsRef.current?.removeEventListener("message", handler);
+              ttsReject(new Error(payload));
+            }
+          };
+          ttsRef.current?.addEventListener("message", handler);
+          ttsRef.current?.postMessage({
+            type: "generate",
+            payload: {
+              text: responseText,
+              voice,
+              speed
+            }
+          });
         });
         if (!cancelledRef.current) await playAudioBuffer(ttsResult.audio, ttsResult.sampleRate);
       }
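
Note: useVoiceChat talks to the raw workers with one-shot addEventListener handlers that unsubscribe on the first terminal message, as above for both STT and TTS. The repeated pattern generalizes to a small helper (hypothetical, not part of the package):

function workerRequest(worker, message, transfer = [], doneTypes = ["transcript", "audio"]) {
  return new Promise((resolve, reject) => {
    const handler = (e) => {
      const { type, payload } = e.data;
      if (type === "error") {
        worker.removeEventListener("message", handler);
        reject(new Error(payload));
      } else if (doneTypes.includes(type)) {
        worker.removeEventListener("message", handler);
        resolve(payload);
      }
    };
    worker.addEventListener("message", handler);
    worker.postMessage(message, transfer);
  });
}
// e.g.: const text = await workerRequest(sttWorker, { type: "transcribe", payload: { audio } }, [audio.buffer]);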