npm - @tritard/waterbrother - Versions diffs - 0.14.12 → 0.14.13 - Mend

@tritard/waterbrother 0.14.12 → 0.14.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tritard/waterbrother",
-  "version": "0.14.12",
+  "version": "0.14.13",
   "description": "Waterbrother: Grok-powered coding CLI with local tools, sessions, operator modes, and approval controls",
   "type": "module",
   "bin": {

package/src/cli.js CHANGED Viewed

@@ -172,7 +172,8 @@ const INTERACTIVE_COMMANDS = [
   { name: "/feedback", description: "Report a bug or share feedback" },
   { name: "/cost", description: "Show session token usage and cost breakdown" },
   { name: "/diff", description: "Show git changes in the current repo" },
-  { name: "/voice", description: "Toggle voice dictation (press space to record)" }
+  { name: "/voice", description: "Toggle voice dictation (press space to record)" },
+  { name: "/speak", description: "Toggle TTS — agent reads responses aloud (esc to stop)" }
 ];
 const AGENT_PROFILES = ["coder", "designer", "reviewer", "planner"];
@@ -4133,6 +4134,15 @@ async function runTextTurnInteractive({
   });
   printAssistantOutput(renderedAssistantText);
+  // TTS: speak the response when speak mode is active
+  if (context.speakModeEnabled && context.voiceSession?.hasTts?.()) {
+    context.voiceSession.speakFull(renderedAssistantText, {
+      apiKey: context.runtime.apiKey,
+      baseUrl: context.runtime.baseUrl,
+    });
+  }
   await setSessionRunState(currentSession, agent, "done");
   printTurnSummary(turnSummary, response, { modelId: agent.getModel(), costTracker: context.costTracker, traceMode: context.runtime.traceMode });
   printTraceTimeline(turnSummary, context.runtime.traceMode);
@@ -4469,6 +4479,7 @@ async function readInteractiveLine(options = {}) {
   return new Promise((resolve, reject) => {
     let buffer = "";
+    let cursorPos = 0;
     let selectedIndex = 0;
     let settled = false;
     let ignoredPastePrintable = 0;
@@ -4478,6 +4489,38 @@ async function readInteractiveLine(options = {}) {
     // Voice recording state
     let voiceRecording = false;
     let voiceIndicator = "";
+    // Tracks the colored region for voice-dictated text
+    // { start, length, state: 'raw'|'corrected', sweepPos: number (chars already swept green) }
+    let voiceSegment = null;
+    let voiceSegmentTimer = null;
+    let voiceSweepTimer = null;
+    function sweepToGreen(start, length) {
+      if (voiceSweepTimer) clearInterval(voiceSweepTimer);
+      if (!voiceSegment) return;
+      voiceSegment.sweepPos = 0;
+      voiceSegment.state = "corrected";
+      const text = buffer.slice(start, start + length);
+      const words = text.split(/(?<=\s)/); // split keeping spaces
+      let charsDone = 0;
+      let wordIdx = 0;
+      voiceSweepTimer = setInterval(() => {
+        if (settled || wordIdx >= words.length) {
+          clearInterval(voiceSweepTimer);
+          voiceSweepTimer = null;
+          if (voiceSegment) voiceSegment.sweepPos = length;
+          render();
+          // Fade to normal after sweep completes
+          if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
+          voiceSegmentTimer = setTimeout(() => { voiceSegment = null; render(); }, 2000);
+          return;
+        }
+        charsDone += words[wordIdx].length;
+        wordIdx++;
+        if (voiceSegment) voiceSegment.sweepPos = charsDone;
+        render();
+      }, 80);
+    }
     function finish(nextValue) {
       if (settled) return;
@@ -4501,7 +4544,24 @@ async function readInteractiveLine(options = {}) {
         selectedIndex = 0;
       }
-      const displayBuffer = voiceIndicator ? `${buffer} ${voiceIndicator}` : buffer;
+      // Apply visual coloring to voice-dictated text segments
+      let coloredBuffer = buffer;
+      if (voiceSegment && voiceSegment.start < buffer.length) {
+        const s = voiceSegment;
+        const before = buffer.slice(0, s.start);
+        const seg = buffer.slice(s.start, s.start + s.length);
+        const after = buffer.slice(s.start + s.length);
+        if (s.state === "corrected" && typeof s.sweepPos === "number" && s.sweepPos < s.length) {
+          // Sweep: green for swept portion, magenta for remaining
+          const swept = seg.slice(0, s.sweepPos);
+          const remaining = seg.slice(s.sweepPos);
+          coloredBuffer = `${before}\x1b[32m${swept}\x1b[35m${remaining}\x1b[0m${after}`;
+        } else {
+          const color = s.state === "corrected" ? "\x1b[32m" : "\x1b[35m";
+          coloredBuffer = `${before}${color}${seg}\x1b[0m${after}`;
+        }
+      }
+      const displayBuffer = voiceIndicator ? `${coloredBuffer} ${voiceIndicator}` : coloredBuffer;
       const writePrompt = () => {
         output.write(formatPromptRow(displayBuffer, columns));
       };
@@ -4539,6 +4599,13 @@ async function readInteractiveLine(options = {}) {
         output.write("\r");
         writePrompt();
       }
+      // Position terminal cursor at cursorPos within the buffer
+      if (cursorPos < buffer.length) {
+        const prefixLen = USER_PREFIX.length + 1; // "you> " visible chars
+        const col = prefixLen + cursorPos + 1; // 1-based column
+        output.write(`\x1b[${col}G`);
+      }
     }
     function cleanup() {
@@ -4560,6 +4627,7 @@ async function readInteractiveLine(options = {}) {
         if (!isExact || buffer === "/") {
           if (suggestionHasArgs(selected.name)) {
             buffer = nextValue;
+            cursorPos = buffer.length;
             selectedIndex = 0;
             render();
             return;
@@ -4593,6 +4661,12 @@ async function readInteractiveLine(options = {}) {
         return;
       }
+      // Escape: stop TTS playback if speaking
+      if (key?.name === "escape" && voiceSession?.isSpeaking?.()) {
+        voiceSession.stopSpeaking();
+        return;
+      }
       if (key?.name === "return" || key?.name === "enter" || str === "\n" || str === "\r") {
         if (suppressSubmit) return;
         handleSubmit();
@@ -4600,14 +4674,44 @@ async function readInteractiveLine(options = {}) {
       }
       if (key?.name === "backspace") {
-        if (buffer.length > 0) {
-          buffer = buffer.slice(0, -1);
+        if (cursorPos > 0) {
+          buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
+          cursorPos--;
+          selectedIndex = 0;
+          render();
+        }
+        return;
+      }
+      if (key?.name === "delete") {
+        if (cursorPos < buffer.length) {
+          buffer = buffer.slice(0, cursorPos) + buffer.slice(cursorPos + 1);
           selectedIndex = 0;
           render();
         }
         return;
       }
+      if (key?.name === "left") {
+        if (cursorPos > 0) { cursorPos--; render(); }
+        return;
+      }
+      if (key?.name === "right") {
+        if (cursorPos < buffer.length) { cursorPos++; render(); }
+        return;
+      }
+      if (key?.name === "home" || (key?.ctrl && key?.name === "a")) {
+        if (cursorPos > 0) { cursorPos = 0; render(); }
+        return;
+      }
+      if (key?.name === "end" || (key?.ctrl && key?.name === "e")) {
+        if (cursorPos < buffer.length) { cursorPos = buffer.length; render(); }
+        return;
+      }
       if (key?.name === "up") {
         const suggestions = getSlashMenuSuggestions(buffer);
         if (suggestions.length > 0) {
@@ -4631,68 +4735,163 @@ async function readInteractiveLine(options = {}) {
         if (suggestions.length > 0) {
           const selected = suggestions[selectedIndex >= 0 ? selectedIndex : 0];
           buffer = commandInputFromSuggestion(selected.name);
+          cursorPos = buffer.length;
           selectedIndex = 0;
           render();
         }
         return;
       }
-      // Voice: spacebar on empty/trailing-space triggers a 5-second recording.
-      // Uses fixed duration with clean sox exit — same code path as test-capture.mjs.
-      if (voiceSession && !voiceRecording && str === " " && (buffer.length === 0 || buffer.endsWith(" "))) {
-        voiceRecording = true;
-        voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
+      // Voice: spacebar stops active streaming recording
+      if (voiceSession && voiceRecording && voiceSession.hasStreaming?.() && str === " ") {
+        voiceRecording = false;
+        voiceIndicator = "\x1b[36m[finalizing...]\x1b[0m";
         render();
         (async () => {
           try {
-            const result = await voiceSession.recordAndTranscribe(5);
-            voiceRecording = false;
+            const result = await voiceSession.stopStreaming();
+              if (result && typeof result === "object" && result.error) {
+                voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
+                render();
+                setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
+                return;
+              }
-            if (result && typeof result === "object" && result.error) {
-              voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
+              const finalText = typeof result === "string" ? result : "";
+              voiceIndicator = "";
+              if (!finalText) { render(); return; }
+              // Replace streaming preview with Moonshine final result
+              const insertPoint = voiceSegment ? voiceSegment.start : buffer.length;
+              const prevLength = voiceSegment ? voiceSegment.length : 0;
+              const before = buffer.slice(0, insertPoint);
+              const after = buffer.slice(insertPoint + prevLength);
+              buffer = before + finalText + after;
+              if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
+              voiceSegment = { start: insertPoint, length: finalText.length, state: "raw" };
+              cursorPos = insertPoint + finalText.length;
               render();
-              setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
-              return;
-            }
-            const rawText = typeof result === "string" ? result : "";
-            voiceIndicator = "";
-            if (!rawText) {
+              // Fire Grok correction with visual sweep
+              if (grokConfig && grokConfig.apiKey) {
+                voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
+                render();
+                voiceSession.correctTranscript(finalText, grokConfig).then((corrected) => {
+                  voiceIndicator = "";
+                  if (settled) return;
+                  const textToSweep = (corrected && corrected !== finalText) ? corrected : finalText;
+                  if (corrected && corrected !== finalText) {
+                    const b = buffer.slice(0, insertPoint);
+                    const a = buffer.slice(insertPoint + finalText.length);
+                    buffer = b + corrected + a;
+                    cursorPos = insertPoint + corrected.length;
+                  }
+                  voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
+                  render();
+                  sweepToGreen(insertPoint, textToSweep.length);
+                });
+              } else {
+                // No Grok — sweep the Moonshine result directly
+                sweepToGreen(insertPoint, finalText.length);
+              }
+            } catch (err) {
+              voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
               render();
-              return;
+              setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
             }
+          })();
+          return;
+      }
-            const insertPoint = buffer.length;
-            buffer += rawText;
+      // Voice: spacebar starts recording (only on empty/trailing-space at end of buffer)
+      if (voiceSession && !voiceRecording && str === " " && cursorPos === buffer.length && (buffer.length === 0 || buffer.endsWith(" "))) {
+        voiceRecording = true;
+        const insertPoint = buffer.length;
+        if (voiceSegmentTimer) clearTimeout(voiceSegmentTimer);
+        if (voiceSweepTimer) { clearInterval(voiceSweepTimer); voiceSweepTimer = null; }
+        let lastStreamRender = 0;
+        let streamRenderPending = null;
+        // Try streaming mode first, fall back to batch
+        const streamingStarted = voiceSession.hasStreaming?.() && voiceSession.startStreaming((partialText) => {
+          // Live update: replace voice segment with streaming partial result
+          const before = buffer.slice(0, insertPoint);
+          const prevLength = voiceSegment ? voiceSegment.length : 0;
+          const after = buffer.slice(insertPoint + prevLength);
+          buffer = before + partialText + after;
+          voiceSegment = { start: insertPoint, length: partialText.length, state: "raw" };
+          cursorPos = insertPoint + partialText.length;
+          // Throttle renders to max ~8fps to prevent flicker
+          const now = Date.now();
+          if (now - lastStreamRender >= 120) {
+            lastStreamRender = now;
+            if (streamRenderPending) { clearTimeout(streamRenderPending); streamRenderPending = null; }
             render();
+          } else if (!streamRenderPending) {
+            streamRenderPending = setTimeout(() => { streamRenderPending = null; lastStreamRender = Date.now(); render(); }, 120);
+          }
+        });
-            if (grokConfig && grokConfig.apiKey) {
-              voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
+        if (streamingStarted) {
+          voiceIndicator = "\x1b[31m[recording — space to stop]\x1b[0m";
+          voiceSegment = { start: insertPoint, length: 0, state: "raw" };
+          render();
+        } else {
+          // Batch fallback (no streaming recognizer)
+          voiceIndicator = "\x1b[31m[recording 5s — speak now]\x1b[0m";
+          render();
+          (async () => {
+            try {
+              const result = await voiceSession.recordAndTranscribe(5);
+              voiceRecording = false;
+              if (result && typeof result === "object" && result.error) {
+                voiceIndicator = `\x1b[31m[${result.error}]\x1b[0m`;
+                render();
+                setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
+                return;
+              }
+              const rawText = typeof result === "string" ? result : "";
+              voiceIndicator = "";
+              if (!rawText) { render(); return; }
+              buffer += rawText;
+              voiceSegment = { start: insertPoint, length: rawText.length, state: "raw" };
+              cursorPos = buffer.length;
               render();
-              voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
-                voiceIndicator = "";
-                if (settled) return;
-                if (corrected && corrected !== rawText) {
-                  const before = buffer.slice(0, insertPoint);
-                  const after = buffer.slice(insertPoint + rawText.length);
-                  buffer = before + corrected + after;
-                }
+              if (grokConfig && grokConfig.apiKey) {
+                voiceIndicator = "\x1b[36m[correcting...]\x1b[0m";
                 render();
-              });
+                voiceSession.correctTranscript(rawText, grokConfig).then((corrected) => {
+                  voiceIndicator = "";
+                  if (settled) return;
+                  const textToSweep = (corrected && corrected !== rawText) ? corrected : rawText;
+                  if (corrected && corrected !== rawText) {
+                    const b = buffer.slice(0, insertPoint);
+                    const a = buffer.slice(insertPoint + rawText.length);
+                    buffer = b + corrected + a;
+                    cursorPos = insertPoint + corrected.length;
+                  }
+                  voiceSegment = { start: insertPoint, length: textToSweep.length, state: "raw" };
+                  render();
+                  sweepToGreen(insertPoint, textToSweep.length);
+                });
+              } else {
+                sweepToGreen(insertPoint, rawText.length);
+              }
+            } catch (err) {
+              voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
+              voiceRecording = false;
+              render();
+              setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
             }
-          } catch (err) {
-            voiceIndicator = `\x1b[31m[voice error: ${err.message || err}]\x1b[0m`;
-            voiceRecording = false;
-            render();
-            setTimeout(() => { voiceIndicator = ""; render(); }, 6000);
-          }
-        })();
+          })();
+        }
         return;
       }
       if (isPrintableKey(str, key)) {
-        buffer += str;
+        buffer = buffer.slice(0, cursorPos) + str + buffer.slice(cursorPos);
+        cursorPos += str.length;
         selectedIndex = 0;
         render();
@@ -4733,7 +4932,8 @@ async function readInteractiveLine(options = {}) {
         if (looksLikePastedBlock) {
           if (normalized) {
-            buffer += normalized;
+            buffer = buffer.slice(0, cursorPos) + normalized + buffer.slice(cursorPos);
+            cursorPos += normalized.length;
             selectedIndex = 0;
             render();
           }
@@ -4758,8 +4958,9 @@ async function readInteractiveLine(options = {}) {
             return;
           }
           if (ch === "\u007f" || ch === "\b") {
-            if (buffer.length > 0) {
-              buffer = buffer.slice(0, -1);
+            if (cursorPos > 0) {
+              buffer = buffer.slice(0, cursorPos - 1) + buffer.slice(cursorPos);
+              cursorPos--;
               selectedIndex = 0;
               render();
             }
@@ -4767,7 +4968,8 @@ async function readInteractiveLine(options = {}) {
           }
           if (ch.charCodeAt(0) < 32 || ch.charCodeAt(0) === 127) continue;
           if (ch.includes("\x1b")) continue;
-          buffer += ch;
+          buffer = buffer.slice(0, cursorPos) + ch + buffer.slice(cursorPos);
+          cursorPos++;
           selectedIndex = 0;
           render();
         }
@@ -5066,8 +5268,9 @@ async function promptLoop(agent, session, context) {
   }
   async function handleNaturalInput(line) {
-    // Product builder intake: detect "I want a recipe app" in any mode
-    if (detectProductRequest(line)) {
+    // Product builder intake: only in standard/guide mode. Expert mode uses cockpit.
+    const currentMode = agent.getExperienceMode();
+    if (detectProductRequest(line) && currentMode !== "expert") {
       const intent = parseProductIntent(line);
       // Extract everything from the request — no interactive prompts (stdin is unreliable on Windows)
@@ -5513,8 +5716,14 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
             lastUsage: context.lastUsage,
             costTracker: context.costTracker
           });
-          if (context.voiceModeEnabled) {
-            return "Voice ON — space to record, space to stop | " + footer;
+          const modes = [];
+          if (context.voiceModeEnabled) modes.push("Voice ON");
+          if (context.speakModeEnabled) modes.push("Speak ON");
+          if (modes.length > 0) {
+            const hints = [];
+            if (context.voiceModeEnabled) hints.push("space to record/stop");
+            if (context.speakModeEnabled) hints.push("esc stops speech");
+            return `${modes.join(" | ")} — ${hints.join(", ")} | ${footer}`;
           }
           return footer;
         },
@@ -7292,18 +7501,49 @@ Be concrete about surfaces — name actual pages/flows. Choose the best stack fo
             context.voiceSession = await setupVoice((msg) => console.log(msg));
           }
           context.voiceModeEnabled = true;
-          console.log("Voice mode ON. Press spacebar to record (5 seconds).");
+          console.log("Voice dictation ON. Press spacebar to record, spacebar to stop.");
           console.log(dim("Tip: Grok will auto-correct technical terms after transcription."));
         } catch (error) {
           console.log(`Voice mode failed: ${error instanceof Error ? error.message : String(error)}`);
         }
       } else {
         context.voiceModeEnabled = false;
-        if (context.voiceSession) {
+        // Only tear down session if speak mode is also off
+        if (!context.speakModeEnabled && context.voiceSession) {
+          context.voiceSession.destroy();
+          context.voiceSession = null;
+        }
+        console.log("Voice dictation OFF.");
+      }
+      continue;
+    }
+    if (line === "/speak") {
+      if (!context.speakModeEnabled) {
+        try {
+          if (!context.runtime.apiKey) {
+            console.log("Speak mode requires an xAI API key. Set your API key first.");
+            continue;
+          }
+          // Voice session is needed for sox playback; set up if not already done
+          if (!context.voiceSession) {
+            const { setupVoice } = await import("./voice.js");
+            context.voiceSession = await setupVoice((msg) => console.log(msg));
+          }
+          context.speakModeEnabled = true;
+          console.log("Speak mode ON. Agent will read responses aloud via xAI.");
+          console.log(dim("Tip: Press Escape to stop speech mid-sentence."));
+        } catch (error) {
+          console.log(`Speak mode failed: ${error instanceof Error ? error.message : String(error)}`);
+        }
+      } else {
+        context.speakModeEnabled = false;
+        // Only tear down session if voice mode is also off
+        if (!context.voiceModeEnabled && context.voiceSession) {
           context.voiceSession.destroy();
           context.voiceSession = null;
         }
-        console.log("Voice mode OFF.");
+        console.log("Speak mode OFF.");
       }
       continue;
     }

package/src/voice.js CHANGED Viewed

@@ -13,8 +13,9 @@ const execFileAsync = promisify(execFile);
 // Paths
 // ---------------------------------------------------------------------------
-const MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
-const MODEL_FILES = [
+// STT model (Moonshine)
+const STT_MODEL_DIR_NAME = "sherpa-onnx-moonshine-base-en-int8";
+const STT_MODEL_FILES = [
   "preprocess.onnx",
   "encode.int8.onnx",
   "uncached_decode.int8.onnx",
@@ -22,15 +23,33 @@ const MODEL_FILES = [
   "tokens.txt"
 ];
+// Streaming STT model (Zipformer 20M)
+const STREAM_MODEL_DIR_NAME = "sherpa-onnx-streaming-zipformer-en-20M-2023-02-17";
+const STREAM_MODEL_FILES = [
+  "encoder-epoch-99-avg-1.int8.onnx",
+  "decoder-epoch-99-avg-1.int8.onnx",
+  "joiner-epoch-99-avg-1.int8.onnx",
+  "tokens.txt"
+];
+// xAI TTS API
+const XAI_TTS_URL = "https://api.x.ai/v1/tts";
 function getWaterbrotherHome() {
   const home = process.env.HOME || process.env.USERPROFILE || "";
   return path.join(home, ".waterbrother");
 }
-function getModelsDir() {
-  return path.join(getWaterbrotherHome(), "models", MODEL_DIR_NAME);
+function getSttModelsDir() { // Directory of the Moonshine STT model inside <waterbrother home>/models.
+  return path.join(getWaterbrotherHome(), "models", STT_MODEL_DIR_NAME);
+}
+function getStreamModelsDir() { // Directory of the Zipformer streaming model inside <waterbrother home>/models.
+  return path.join(getWaterbrotherHome(), "models", STREAM_MODEL_DIR_NAME);
 }
 function getVoiceRuntimeDir() {
   return path.join(getWaterbrotherHome(), "voice-runtime");
 }
@@ -94,31 +113,49 @@ async function checkSherpaOnnx() {
   }
 }
-async function checkModel() {
-  const dir = getModelsDir();
+async function checkSttModel() {
+  const dir = getSttModelsDir();
+  try {
+    const entries = await fs.readdir(dir);
+    const missing = STT_MODEL_FILES.filter((f) => !entries.includes(f));
+    return { ok: missing.length === 0, dir, missing };
+  } catch {
+    return { ok: false, dir, missing: STT_MODEL_FILES };
+  }
+}
+async function checkStreamModel() { // Resolves to { ok, dir, missing } for the Zipformer streaming model files.
+  const dir = getStreamModelsDir();
   try {
     const entries = await fs.readdir(dir);
-    const missing = MODEL_FILES.filter((f) => !entries.includes(f));
+    const missing = STREAM_MODEL_FILES.filter((f) => !entries.includes(f)); // expected files absent from dir
     return { ok: missing.length === 0, dir, missing };
   } catch {
-    return { ok: false, dir, missing: MODEL_FILES };
+    return { ok: false, dir, missing: STREAM_MODEL_FILES }; // readdir failed: report every expected file as missing
   }
 }
 // ---------------------------------------------------------------------------
 // Model download
 // ---------------------------------------------------------------------------
-const MODEL_ARCHIVE_URL =
-  `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${MODEL_DIR_NAME}.tar.bz2`;
+const STT_MODEL_ARCHIVE_URL =
+  `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STT_MODEL_DIR_NAME}.tar.bz2`;
-async function downloadModel(onProgress) {
-  const modelsRoot = path.dirname(getModelsDir());
+const STREAM_MODEL_ARCHIVE_URL =
+  `https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/${STREAM_MODEL_DIR_NAME}.tar.bz2`;
+async function downloadArchive(archiveUrl, dirName, onProgress) {
+  const modelsRoot = path.join(getWaterbrotherHome(), "models");
   await fs.mkdir(modelsRoot, { recursive: true });
   if (onProgress) onProgress({ status: "downloading" });
-  const response = await fetch(MODEL_ARCHIVE_URL, { redirect: "follow" });
+  const response = await fetch(archiveUrl, { redirect: "follow" });
   if (!response.ok) {
     throw new Error(`Failed to download model archive: HTTP ${response.status}`);
   }
@@ -139,7 +176,7 @@ async function downloadModel(onProgress) {
   }
   // Write archive to temp file, then extract
-  const archivePath = path.join(modelsRoot, `${MODEL_DIR_NAME}.tar.bz2`);
+  const archivePath = path.join(modelsRoot, `${dirName}.tar.bz2`);
   const archiveBuffer = Buffer.concat(chunks);
   await fs.writeFile(archivePath, archiveBuffer);
   if (onProgress) onProgress({ status: "extracting" });
@@ -174,7 +211,7 @@ function createRecognizer() {
   const sherpa = _sherpaOnnx;
   if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
-  const dir = getModelsDir();
+  const dir = getSttModelsDir();
   const config = {
     modelConfig: {
       moonshine: {
@@ -193,6 +230,35 @@ function createRecognizer() {
   return new sherpa.OfflineRecognizer(config);
 }
+function createStreamingRecognizer() {
+  const sherpa = _sherpaOnnx;
+  if (!sherpa) throw new Error("sherpa-onnx-node not loaded");
+  const dir = getStreamModelsDir();
+  const config = {
+    featConfig: { sampleRate: 16000, featureDim: 80 },
+    modelConfig: {
+      transducer: {
+        encoder: path.join(dir, "encoder-epoch-99-avg-1.int8.onnx"),
+        decoder: path.join(dir, "decoder-epoch-99-avg-1.int8.onnx"),
+        joiner: path.join(dir, "joiner-epoch-99-avg-1.int8.onnx"),
+      },
+      tokens: path.join(dir, "tokens.txt"),
+      numThreads: 2,
+      provider: "cpu",
+      debug: 0,
+    },
+    decodingMethod: "greedy_search",
+    enableEndpoint: true,
+    rule1MinTrailingSilence: 2.4,
+    rule2MinTrailingSilence: 1.2,
+    rule3MinUtteranceLength: 20,
+  };
+  return new sherpa.OnlineRecognizer(config);
+}
 // ---------------------------------------------------------------------------
 // Audio device detection (Windows)
 // ---------------------------------------------------------------------------
@@ -450,32 +516,50 @@ export async function setupVoice(onStatus) {
   }
   log("  sherpa-onnx: ready");
-  // 3. Model — auto-download if missing
-  const model = await checkModel();
-  if (!model.ok) {
-    log("  Downloading Moonshine Base model (~250 MB)...");
-    await downloadModel(({ status, downloaded, total, size }) => {
-      if (status === "progress" && total > 0) {
-        const pct = Math.round((downloaded / total) * 100);
-        process.stdout.write(`\r    ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
-      } else if (status === "extracting") {
-        process.stdout.write(`\r    Extracting...                              \n`);
-      } else if (status === "done") {
-        log(`    Done (${formatBytes(size)})`);
-      }
-    });
-    log("  Model ready.");
+  // 3. STT model — auto-download if missing
+  const sttModel = await checkSttModel();
+  if (!sttModel.ok) {
+    log("  Downloading Moonshine Base STT model (~250 MB)...");
+    await downloadArchive(STT_MODEL_ARCHIVE_URL, STT_MODEL_DIR_NAME, downloadProgressHandler(log));
+    log("  STT model ready.");
   } else {
     log("  Moonshine Base: ready");
   }
-  // 4. Detect audio device (Windows)
+  // 4. Streaming STT model — auto-download if missing
+  const streamModel = await checkStreamModel();
+  if (!streamModel.ok) {
+    log("  Downloading Zipformer streaming STT model (~122 MB)...");
+    await downloadArchive(STREAM_MODEL_ARCHIVE_URL, STREAM_MODEL_DIR_NAME, downloadProgressHandler(log));
+    log("  Streaming STT model ready.");
+  } else {
+    log("  Zipformer streaming: ready");
+  }
+  // 5. Detect audio device (Windows)
   const soxPath = sox.path;
   const audioDevice = await detectAudioDevice(soxPath, log);
-  // 5. Initialize recognizer
+  // 6. Initialize recognizers
   await loadSherpaOnnx();
   const recognizer = createRecognizer();
+  let streamingRecognizer = null;
+  try {
+    streamingRecognizer = createStreamingRecognizer();
+    log("  Zipformer streaming: initialized");
+  } catch (err) {
+    log(`  Zipformer streaming: failed (${err.message}) — falling back to batch mode`);
+  }
+  log("  TTS: xAI voice API (requires /speak + API key)");
+  // Active streaming recording state
+  let _streamingSox = null;
+  let _streamingStream = null;
+  let _streamingChunks = [];
+  // TTS playback state
+  let _ttsPlayback = null;
+  let _ttsCancelled = false;
   return {
     // Record for a fixed duration (sox exits cleanly, no kill).
@@ -492,11 +576,186 @@ export async function setupVoice(onStatus) {
       return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
     },
+    // Start streaming recognition. onPartial(text) fires as words are recognized.
+    // Returns true if streaming started, false if falling back to batch.
+    startStreaming(onPartial) {
+      if (!streamingRecognizer) return false;
+      _streamingChunks = [];
+      _streamingStream = streamingRecognizer.createStream();
+      const isWin = process.platform === "win32";
+      const inputArgs = isWin
+        ? ["-t", "waveaudio", audioDevice || "default"]
+        : ["-d"];
+      const args = [
+        ...inputArgs,
+        "-t", "raw", "-r", "16000", "-c", "1", "-b", "16", "-e", "signed-integer",
+        "-"
+      ];
+      _streamingSox = spawn(soxPath, args, { stdio: ["ignore", "pipe", "ignore"] });
+      _streamingSox.stdout.on("data", (chunk) => {
+        _streamingChunks.push(chunk);
+        const samples = new Float32Array(Math.floor(chunk.length / 2));
+        for (let i = 0; i < samples.length; i++) {
+          samples[i] = chunk.readInt16LE(i * 2) / 32768.0;
+        }
+        _streamingStream.acceptWaveform({ sampleRate: 16000, samples });
+        while (streamingRecognizer.isReady(_streamingStream)) {
+          streamingRecognizer.decode(_streamingStream);
+        }
+        const text = streamingRecognizer.getResult(_streamingStream).text.trim();
+        if (text) onPartial(text);
+      });
+      return true;
+    },
+    // Stop streaming and finalize with Moonshine for accuracy.
+    // Returns final text or { error: "..." }.
+    async stopStreaming() {
+      if (_streamingSox) {
+        _streamingSox.kill();
+        _streamingSox = null;
+      }
+      if (_streamingStream) {
+        streamingRecognizer.reset(_streamingStream);
+        _streamingStream = null;
+      }
+      // Combine all captured chunks and run Moonshine for final accuracy
+      if (_streamingChunks.length === 0) {
+        return { error: "No audio captured" };
+      }
+      const fullBuffer = Buffer.concat(_streamingChunks);
+      _streamingChunks = [];
+      const samples = new Float32Array(Math.floor(fullBuffer.length / 2));
+      for (let i = 0; i < samples.length; i++) {
+        samples[i] = fullBuffer.readInt16LE(i * 2) / 32768.0;
+      }
+      const durationMs = Math.round((samples.length / 16000) * 1000);
+      let maxAmp = 0;
+      for (const v of samples) { const a = Math.abs(v); if (a > maxAmp) maxAmp = a; }
+      const text = transcribe(recognizer, samples);
+      if (text) return text;
+      if (samples.length < 1600) return { error: `Recording too short (${durationMs}ms)` };
+      if (maxAmp < 0.01) return { error: `Silence (${durationMs}ms, amp=${maxAmp.toFixed(4)}) — mic not active` };
+      return { error: `No speech detected (${durationMs}ms, amp=${maxAmp.toFixed(4)})` };
+    },
+    hasStreaming() { return streamingRecognizer !== null; },
     async correctTranscript(rawText, grokConfig) {
       return correctTranscript(rawText, grokConfig);
     },
-    destroy() {}
+    // Speak full text via xAI TTS API. Streams MP3 to temp file, plays via sox.
+    // Can be cancelled via stopSpeaking(). Requires grokConfig with apiKey.
+    async speakFull(text, { apiKey, baseUrl } = {}) {
+      if (!text || !apiKey) return;
+      _ttsCancelled = false;
+      // Strip ANSI, markdown, code blocks, emojis
+      let clean = text.replace(/\x1b\[[0-9;]*m/g, "").replace(/```[\s\S]*?```/g, "").replace(/`[^`]+`/g, "");
+      clean = clean.replace(/[#*_~>]/g, "");
+      clean = clean.replace(/[\u{1F600}-\u{1F64F}\u{1F300}-\u{1F5FF}\u{1F680}-\u{1F6FF}\u{1F1E0}-\u{1F1FF}\u{2600}-\u{27BF}\u{2B50}\u{2B55}\u{231A}-\u{23F3}\u{23CF}\u{200D}\u{FE0F}\u{20E3}\u{E0020}-\u{E007F}\u{1F900}-\u{1F9FF}\u{1FA00}-\u{1FA6F}\u{1FA70}-\u{1FAFF}]/gu, "");
+      const lines = clean.split("\n").map((l) => l.trim()).filter(Boolean);
+      const prose = lines.filter((l) => !(/^[/\\+\-@{]/.test(l) || /^\d+[:|]/.test(l) || l.length < 3));
+      const fullText = prose.join(". ");
+      if (!fullText.trim()) return;
+      try {
+        const response = await fetch(XAI_TTS_URL, {
+          method: "POST",
+          headers: {
+            "Authorization": `Bearer ${apiKey}`,
+            "Content-Type": "application/json",
+          },
+          body: JSON.stringify({
+            text: fullText,
+            voice_id: "eve",
+            language: "en",
+          }),
+        });
+        if (!response.ok) return;
+        if (_ttsCancelled) return;
+        const audioBuffer = Buffer.from(await response.arrayBuffer());
+        if (_ttsCancelled || !audioBuffer.length) return;
+        const tmpDir = path.join(getWaterbrotherHome(), "tmp");
+        await fs.mkdir(tmpDir, { recursive: true });
+        const ts = Date.now();
+        const mp3Path = path.join(tmpDir, `tts-${ts}.mp3`);
+        await fs.writeFile(mp3Path, audioBuffer);
+        if (_ttsCancelled) { fs.unlink(mp3Path).catch(() => {}); return; }
+        // Play MP3 — platform-native players
+        const cleanupFiles = [mp3Path];
+        let playCmd, playArgs;
+        if (process.platform === "darwin") {
+          playCmd = "afplay";
+          playArgs = [mp3Path];
+        } else if (process.platform === "win32") {
+          // PowerShell MediaPlayer — write temp .ps1 to avoid escaping issues
+          const psPath = path.join(tmpDir, `tts-${ts}.ps1`);
+          await fs.writeFile(psPath, [
+            "Add-Type -AssemblyName PresentationCore",
+            "$p = New-Object System.Windows.Media.MediaPlayer",
+            `$p.Open([uri]"${mp3Path.replace(/\\/g, "/")}")`,
+            "$p.Play()",
+            "Start-Sleep -Milliseconds 500",
+            "while($p.Position -lt $p.NaturalDuration.TimeSpan){ Start-Sleep -Milliseconds 200 }",
+            "$p.Close()",
+          ].join("\n"));
+          cleanupFiles.push(psPath);
+          playCmd = "powershell.exe";
+          playArgs = ["-NoProfile", "-ExecutionPolicy", "Bypass", "-File", psPath];
+        } else {
+          playCmd = "mpv";
+          playArgs = ["--no-video", "--really-quiet", mp3Path];
+        }
+        await new Promise((resolve) => {
+          const child = spawn(playCmd, playArgs, { stdio: "ignore" });
+          _ttsPlayback = child;
+          child.on("exit", () => {
+            if (_ttsPlayback === child) _ttsPlayback = null;
+            for (const f of cleanupFiles) fs.unlink(f).catch(() => {});
+            resolve();
+          });
+          child.on("error", () => { resolve(); });
+        });
+      } catch {
+        // TTS failed — silently ignore
+      }
+      _ttsCancelled = false;
+    },
+    // Stop any in-progress speech playback.
+    stopSpeaking() {
+      _ttsCancelled = true;
+      if (_ttsPlayback) {
+        try { _ttsPlayback.kill(); } catch {}
+        _ttsPlayback = null;
+      }
+    },
+    isSpeaking() { return _ttsPlayback !== null; },
+    hasTts() { return true; },
+    destroy() {
+      if (_streamingSox) { _streamingSox.kill(); _streamingSox = null; }
+      this.stopSpeaking();
+    }
   };
 }
@@ -504,6 +763,19 @@ export async function setupVoice(onStatus) {
 // Helpers
 // ---------------------------------------------------------------------------
+function downloadProgressHandler(log) {
+  return ({ status, downloaded, total, size }) => {
+    if (status === "progress" && total > 0) {
+      const pct = Math.round((downloaded / total) * 100);
+      process.stdout.write(`\r    ${pct}% (${formatBytes(downloaded)}/${formatBytes(total)})`);
+    } else if (status === "extracting") {
+      process.stdout.write(`\r    Extracting...                              \n`);
+    } else if (status === "done") {
+      log(`    Done (${formatBytes(size)})`);
+    }
+  };
+}
 function formatBytes(bytes) {
   if (bytes < 1024) return `${bytes} B`;
   if (bytes < 1024 * 1024) return `${(bytes / 1024).toFixed(1)} KB`;