npm - @agentprojectcontext/apx - Versions diffs - 1.27.2 → 1.28.0 - Mend

@agentprojectcontext/apx 1.27.2 → 1.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/package.json +1 -1
package/src/host/daemon/api/transcribe.js +12 -0
package/src/host/daemon/plugins/desktop.js +34 -12
package/src/host/daemon/transcription.js +29 -0
package/src/host/daemon/whisper-server.py +11 -0
package/src/interfaces/desktop/main.js +48 -5
package/src/interfaces/desktop/preload.js +10 -4
package/src/interfaces/desktop/renderer.js +354 -119

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@agentprojectcontext/apx",
-  "version": "1.27.2",
+  "version": "1.28.0",
   "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
   "publishConfig": {
     "access": "public"

package/src/host/daemon/api/transcribe.js CHANGED Viewed

@@ -6,6 +6,18 @@
 //
 // Shared by overlay, telegram voice messages, and any external caller.
 export function register(app) {
+  // GET /transcribe/warmup — load the local whisper model (if needed) and reset
+  // its idle watchdog. Callers (e.g. the desktop window) ping this while open so
+  // the first real utterance doesn't pay the cold-load cost.
+  // Responds with the JSON result of warmupWhisper(); if the dynamic import or
+  // the call throws, responds 500 with { ok: false, error }.
+  app.get("/transcribe/warmup", async (_req, res) => {
+    try {
+      const { warmupWhisper } = await import("../transcription.js");
+      res.json(await warmupWhisper());
+    } catch (e) {
+      res.status(500).json({ ok: false, error: e.message });
+    }
+  });
   app.post("/transcribe/chunk", async (req, res) => {
     const chunks = [];
     req.on("data", (c) => chunks.push(c));

package/src/host/daemon/plugins/desktop.js CHANGED Viewed

@@ -100,9 +100,27 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
     await appendGlobalMessage({ channel: CHANNEL, direction: "in", type: "user", author: "user", body: text });
   } catch {}
-  let fullResponse = "";
   let toolsExecuted = [];
+  // Per-segment streaming: instead of merging the whole turn into one blob, we
+  // emit each assistant text piece as its own `segment` (an intro before a tool,
+  // then the post-tool answer, …). The renderer renders each as its own bubble
+  // and synthesizes its own audio, so a multi-step reply reads as separate spoken
+  // messages instead of one run-on bubble. `liveBuf` accumulates streamed tokens
+  // (streaming engines) so they can be flushed as a segment at each boundary;
+  // for non-streaming models like gemini the text arrives whole via events.
+  let segSeq = 0;
+  let lastSegText = "";
+  let liveBuf = "";
+  const emittedSegments = [];
+  // Sends one assistant text segment to the client. The text is trimmed; empty
+  // text and text identical to the previously emitted segment are dropped.
+  // Each emitted segment is recorded in `emittedSegments` and sent as a
+  // `segment` message with an incrementing `seq` (first segment is 1).
+  const emitSegment = (raw) => {
+    const seg = (raw || "").trim();
+    if (!seg || seg === lastSegText) return;
+    lastSegText = seg;
+    emittedSegments.push(seg);
+    _send(ws, { type: "segment", seq: ++segSeq, text: seg });
+  };
   try {
     if (!isSuperAgentEnabled(config)) {
       throw new Error("super-agent not enabled — set super_agent.enabled + super_agent.model in ~/.apx/config.json");
@@ -120,10 +138,7 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
       previousMessages: history.slice(0, -1),
       overrideModel: cfg.model || null,
       signal: controller.signal,
-      onToken: (chunk) => {
-        fullResponse += chunk;
-        _send(ws, { type: "token", text: chunk });
-      },
+      onToken: (chunk) => { liveBuf += chunk; },
       onEvent: async (event) => {
         if (event.type === "tool_start") {
           const t = event.trace;
@@ -131,17 +146,24 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
           _send(ws, { type: "tool_start", name: t.tool, args: t.args });
         } else if (event.type === "tool_result") {
           _send(ws, { type: "tool_done", name: event.trace.tool });
-        } else if (event.type === "assistant_text" && event.text && !fullResponse) {
-          _send(ws, { type: "token", text: event.text });
-          fullResponse += event.text;
+        } else if (event.type === "assistant_text" && event.text) {
+          // A complete assistant text segment (e.g. the "I'll check…" intro
+          // emitted right before a tool runs). Ship it as its own message.
+          emitSegment(event.text);
+          liveBuf = "";
         }
       },
     });
-    const finalText = fullResponse || result.text || "";
-    log(`desktop: super-agent turn done in ${Date.now() - t0}ms text_len=${finalText.length}`);
+    // The final (no-tool) iteration's answer appears ONLY in result.text (or, for
+    // streaming engines, in liveBuf) — it's never emitted as an event. Ship it as
+    // the closing segment (deduped against the last one).
+    emitSegment((result.text || "").trim() || liveBuf.trim());
+    const finalText = emittedSegments.join("\n\n");
+    log(`desktop: super-agent turn done in ${Date.now() - t0}ms segments=${segSeq} text_len=${finalText.length} tools=${toolsExecuted.length}`);
-    // Emit done with full text
-    _send(ws, { type: "done", text: finalText });
+    // Turn end. `segments` lets the renderer know how many bubbles to expect.
+    _send(ws, { type: "done", segments: segSeq, text: finalText });
     // Append assistant turn to history
     if (ws && histories) {

package/src/host/daemon/transcription.js CHANGED Viewed

@@ -481,6 +481,35 @@ export async function preloadWhisperServer(log = console.log) {
   }
 }
+/**
+ * Keep the local whisper server warm. Calls ensureWhisperServer, then requests
+ * the server's /warmup endpoint, which loads the model and resets the server's
+ * idle watchdog so a live session (e.g. the desktop window held open) never
+ * pays the cold-load cost on the next utterance.
+ * Safe to call repeatedly. Never throws: a failed or timed-out /warmup request
+ * is ignored (reported as loaded: false), and any other error is returned as
+ * { ok: false, error }.
+ * When the configured provider is "openai" no local server is contacted and
+ * the result is { ok: true, provider: "openai", loaded: false }.
+ * Returns { ok, model?, loaded?, provider?, error? } for the caller to surface.
+ */
+export async function warmupWhisper() {
+  try {
+    const cfg = await getConfig();
+    if (cfg.provider === "openai") return { ok: true, provider: "openai", loaded: false };
+    await ensureWhisperServer(cfg.local);
+    // /warmup loads the model into RAM (lazy otherwise) AND touches _last_used,
+    // resetting the idle timer. First call may block ~15-30s on a cold model;
+    // instant once warm. Generous timeout so the cold load can finish.
+    let loaded = false;
+    try {
+      const r = await fetch(`http://127.0.0.1:${WHISPER_PORT}/warmup`, {
+        signal: AbortSignal.timeout(40_000),
+      });
+      const j = await r.json().catch(() => ({}));
+      loaded = !!j.loaded;
+    } catch {}
+    return { ok: true, provider: "local", model: _serverModel, loaded };
+  } catch (e) {
+    return { ok: false, error: e.message };
+  }
+}
 /**
  * Stop the whisper server we own (no-op if we adopted an external one).
  */

package/src/host/daemon/whisper-server.py CHANGED Viewed

@@ -94,6 +94,17 @@ class _Handler(BaseHTTPRequestHandler):
                 "model": _model_name or _Handler.model_name,
                 "loaded": _model is not None,
             })
+        elif self.path == "/warmup":
+            # Eagerly load the model into RAM (no audio needed) and reset the
+            # idle timer, so the first real transcription isn't cold. Blocks
+            # until the model is loaded the first time; instant once warm.
+            _touch()
+            with _model_lock:
+                try:
+                    _load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
+                    self._send_json(200, {"ok": True, "loaded": _model is not None, "model": _model_name})
+                except Exception as e:
+                    self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
         else:
             self._send_json(404, {"ok": False, "error": "not found"})

package/src/interfaces/desktop/main.js CHANGED Viewed

@@ -46,6 +46,20 @@ function getShortcut() {
   return cfg?.desktop?.shortcut || cfg?.overlay?.shortcut || DEFAULT_SHORTCUT;
 }
+// Voice-capture timing for the listening capsule. Overridable in config.json:
+//   "desktop": { "silence_ms": 1200, "voice_rms": 0.025 }
+// silence_ms — quiet after speech before auto-send (default 1200, floor 400).
+// voice_rms — RMS above which audio counts as voice (lower = more sensitive;
+// default 0.025, floor 0). Non-numeric or non-finite values use the default.
+// Values come from the "desktop" section, or "overlay" when "desktop" is missing.
+function getVoiceTiming() {
+  const cfg = readApxConfig();
+  const d = cfg?.desktop || cfg?.overlay || {};
+  const num = (v, def) => (typeof v === "number" && isFinite(v) ? v : def);
+  return {
+    silence_ms: Math.max(400, num(d.silence_ms, 1200)),
+    voice_rms:  Math.max(0,   num(d.voice_rms,  0.025)),
+  };
+}
 function readToken() {
   try { return fs.readFileSync(TOKEN_PATH, "utf8").trim(); } catch { return ""; }
 }
@@ -397,6 +411,7 @@ ipcMain.handle("get-shortcut", () => getShortcut());
 ipcMain.handle("get-theme",    () => getTheme());
 ipcMain.handle("get-position", () => getPosition());
 ipcMain.handle("get-agent-name", () => getAgentName());
+ipcMain.handle("get-voice-timing", () => getVoiceTiming());
 // Renderer asks main to grow/shrink the window to fit its content.
 // Clamped to [WIN_H_MIN, getMaxWindowHeight()]; same anchor (top edge stays put).
@@ -411,27 +426,29 @@ ipcMain.on("resize-window", (_e, { height }) => {
 // Renderer asks for TTS playback of the agent reply. We synthesize via the
 // daemon and pipe the audio path back as a daemon-event the renderer already
 // knows how to consume (tts-ready { url, duration } / tts-failed).
-ipcMain.handle("request-tts", async (_e, { text }) => {
+ipcMain.handle("request-tts", async (_e, { text, seg }) => {
   if (!text || !text.trim()) {
-    mainWindow?.webContents.send("daemon-event", { type: "tts-failed" });
+    mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg });
     return;
   }
   try {
     const result = await daemonTtsSay(text);
     if (result?.ok && result.audio_path) {
       // Expose the local file via file:// — preload's contextIsolation lets
-      // the renderer's <audio> tag fetch it directly.
+      // the renderer's <audio> tag fetch it directly. `seg` ties this audio to
+      // the bubble that asked for it.
       const url = "file://" + result.audio_path;
       mainWindow?.webContents.send("daemon-event", {
         type: "tts-ready",
+        seg,
         url,
         duration: result.duration_s || 0,
       });
     } else {
-      mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: result?.error || "no audio" });
+      mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: result?.error || "no audio" });
     }
   } catch (e) {
-    mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: e.message });
+    mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: e.message });
   }
 });
@@ -462,6 +479,32 @@ ipcMain.handle("check-whisper-ready", () => {
   });
 });
+// Renderer asks to keep STT warm. Routed through the daemon (not whisper
+// directly) so it both LOADS the model if it idled out and resets the idle
+// watchdog. Fire-and-forget from the renderer's side.
+// Sends GET /transcribe/warmup (with the bearer token when one is available)
+// and resolves with the parsed JSON body. Never rejects: a request error, an
+// unparseable body, or the 45s timeout all resolve to { ok: false }.
+ipcMain.handle("warmup-stt", async () => {
+  return new Promise((resolve) => {
+    const token = readToken();
+    const options = {
+      hostname: DAEMON_HOST,
+      port: DAEMON_PORT,
+      path: "/transcribe/warmup",
+      method: "GET",
+      headers: { ...(token ? { "Authorization": `Bearer ${token}` } : {}) },
+    };
+    const req = http.request(options, (res) => {
+      let data = "";
+      res.on("data", (c) => data += c);
+      res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve({ ok: false }); } });
+    });
+    req.on("error", () => resolve({ ok: false }));
+    // Cold model load can take ~30s; give it room. (Renderer fires this
+    // fire-and-forget, so a long warm-up never blocks the UI.)
+    req.setTimeout(45000, () => { req.destroy(); resolve({ ok: false }); });
+    req.end();
+  });
+});
 // Renderer requests recording toggle (ESC cancels, shortcut toggles)
 ipcMain.handle("toggle-recording", async () => {
   if (isRecording) stopRecording(); else startRecording();

package/src/interfaces/desktop/preload.js CHANGED Viewed

@@ -18,14 +18,19 @@ contextBridge.exposeInMainWorld("apx", {
   // Check if the whisper model is loaded (false = still loading)
   checkWhisperReady: () => ipcRenderer.invoke("check-whisper-ready"),
+  // Keep STT warm (loads the model if idle + resets the idle timer). Called
+  // while the window is open / on mic-open so the first decode isn't cold.
+  warmupStt: () => ipcRenderer.invoke("warmup-stt").catch(() => ({ ok: false })),
   // Send final text to daemon
   sendMessage: (text, previousMessages) =>
     ipcRenderer.invoke("send-message", { text, previousMessages }),
-  // After "done", ask main to synthesize TTS. Returns true if main will reply
-  // with a tts-ready / tts-failed daemon-event; false if TTS is not wired.
-  requestTts: (text) => {
-    ipcRenderer.invoke("request-tts", { text }).catch(() => {});
+  // Ask main to synthesize TTS for one segment. `seg` correlates the resulting
+  // tts-ready/tts-failed event back to the bubble that requested it (each
+  // assistant message has its own audio). Returns true optimistically.
+  requestTts: (text, seg) => {
+    ipcRenderer.invoke("request-tts", { text, seg }).catch(() => {});
     return true; // optimistic; renderer waits for the event either way
   },
@@ -42,6 +47,7 @@ contextBridge.exposeInMainWorld("apx", {
   getTheme:     () => ipcRenderer.invoke("get-theme"),
   getPosition:  () => ipcRenderer.invoke("get-position"),
   getAgentName: () => ipcRenderer.invoke("get-agent-name"),
+  getVoiceTiming: () => ipcRenderer.invoke("get-voice-timing"),
   // Renderer asks main to resize the BrowserWindow to the rendered height
   resize: (height) => ipcRenderer.send("resize-window", { height }),

package/src/interfaces/desktop/renderer.js CHANGED Viewed

@@ -31,15 +31,49 @@
   let recorderFormat = "webm";
   let liveBusy = false;
+  // Mic is async to open (getUserMedia + recorder warm-up). Until it's actually
+  // capturing we show a "Cargando…" state instead of the wave, so the user
+  // doesn't talk into the dead gap before the recorder starts.
+  let micReady = false;
+  // Silence auto-send: once speech has been heard, SILENCE_MS of quiet
+  // auto-commits the recording. RMS (time-domain) is the voice/silence gate.
+  // Both are overridable from config.json (desktop.silence_ms / voice_rms).
+  let speechSeen = false;
+  let lastVoiceTs = 0;
+  let SILENCE_MS = 1200;        // quiet after speech → send on its own
+  let VOICE_RMS  = 0.025;       // RMS above this counts as voice (0 = silence)
+  const PAUSE_PREVIEW_MS = 600; // a short pause kicks ONE decode (reused on send)
+  // When a pause triggers a preview decode, that decode already covers all the
+  // speech (the tail is just trailing silence), so the auto-send reuses it
+  // instead of paying a second full decode. These coordinate that handoff.
+  let pausePreviewed = false;   // a preview decode fired for the current pause
+  let reuseLiveOnStop = false;  // commit should reuse pendingUserText, not re-decode
+  let livePromise = null;       // in-flight preview decode (awaited on reuse)
   // Web Audio analyser — drives the live capsule wave from real mic amplitude
   let audioCtx = null;
   let analyser = null;
   let freqData = null;
+  let timeData = null;
   let waveRaf = null;
-  let streamingAgentEntry = null; // { id, role:'agent', el, ... } during thinking/speaking
-  let toolPillsByName = {};       // active tool pills inside the streaming bubble row
-  let ttsAudio = null;            // <audio> playing the agent reply
+  let streamingAgentEntry = null; // legacy single-bubble streaming (kept dormant)
+  let toolPillsByName = {};       // active tool pills, by tool name, for the live turn
+  let ttsAudio = null;            // <audio> currently playing
+  // ── Per-segment turn rendering ──────────────────────────────────────────
+  // A turn is now N agent message bubbles (intro, post-tool answer, …), each
+  // with its own audio. `currentTurn` tags every bubble of a turn so regen can
+  // drop the whole turn. The audio queue plays segment audios in seq order
+  // (gapless auto-play), waiting at the cursor for each segment's TTS to land.
+  let currentTurn = 0;
+  let turnAudios = [];            // [{ m, ready, failed, played }] ordered by seq
+  let audioCursor = 0;            // index of the next segment to play
+  let queuePlaying = false;       // a segment audio is currently playing
+  let turnDone = false;           // `done` received for the active turn
+  let turnWatchdog = null;        // flushes the queue if a segment's TTS hangs
   let history = [];               // [{role:'user'|'assistant', content}] sent to daemon for context
   let theme = "light";
@@ -119,10 +153,15 @@
     window.apx?.getPosition?.()  ?? "right",
     window.apx?.getShortcut?.()  ?? "CommandOrControl+G",
     window.apx?.getAgentName?.() ?? "Superagente",
-  ]).then(([th, pos, shortcut, name]) => {
+    window.apx?.getVoiceTiming?.() ?? null,
+  ]).then(([th, pos, shortcut, name, timing]) => {
     theme = th || "light";
     position = pos || "right";
     agentName = (name && String(name).trim()) || "Superagente";
+    if (timing) {
+      if (typeof timing.silence_ms === "number") SILENCE_MS = timing.silence_ms;
+      if (typeof timing.voice_rms === "number")  VOICE_RMS  = timing.voice_rms;
+    }
     document.documentElement.setAttribute("data-theme", theme);
     setPosition(position);
     initialCaption(shortcut);
@@ -217,6 +256,13 @@
         }
       }
       // else: input already there → leave it alone (preserves focus + caret)
+    } else if (mode === "listening" && !micReady) {
+      // Mic still opening (getUserMedia + recorder warm-up). Show a loading
+      // status so the user waits for capture instead of talking into the gap.
+      if ($capCenter.dataset.mode !== "loading") {
+        $capCenter.dataset.mode = "loading";
+        $capCenter.innerHTML = `<span class="status"><span class="dots"><i></i><i></i><i></i></span><span class="shimmer">Cargando…</span></span>`;
+      }
     } else if (mode === "listening") {
       // Only rebuild the wave if it's not already there (avoids restarting
       // CSS animations / Web Audio binding every render).
@@ -246,9 +292,10 @@
         }
       }
     }
-    // Clear data-mode when we're back to idle/listening so a future busy mode
-    // re-renders correctly.
-    if (mode === "idle" || mode === "listening") $capCenter.dataset.mode = "";
+    // Clear data-mode when we're back to idle, or once the live wave is up, so
+    // a future busy mode re-renders correctly. While the mic is still warming
+    // up we keep the "loading" marker so "Cargando…" isn't rebuilt every frame.
+    if (mode === "idle" || (mode === "listening" && micReady)) $capCenter.dataset.mode = "";
     // actions
     $capActions.innerHTML = "";
@@ -273,7 +320,8 @@
       }
     } else if (mode === "listening") {
       addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
-      addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
+      // No "Enviar" until the recorder is live — nothing to send mid-warm-up.
+      if (micReady) addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
     } else if (mode === "transcribing") {
       addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
     } else if (mode === "thinking") {
@@ -321,7 +369,6 @@
       // Re-render all existing turns
       messages.forEach((m, i) => appendTurn(m, i === messages.length - 1));
       if (mode === "transcribing") renderPendingUserPartial();
-      if (mode === "thinking" || mode === "speaking") ensureStreamingAgentBubble();
     }
   }
@@ -388,7 +435,10 @@
         if (history.length && history[history.length - 1].role === "assistant") {
           history.pop();
         }
-        messages = messages.filter((x) => x.id !== m.id);
+        // A turn can be several agent bubbles (intro + post-tool answer…); drop
+        // them all so regen replaces the whole turn, not just the last segment.
+        const turnId = m.turn;
+        messages = messages.filter((x) => !(x.role === "agent" && turnId != null && x.turn === turnId) && x.id !== m.id);
         rebuildConvFromState();
         startAgentTurn();
         sendToDaemon(lastUser.text);
@@ -499,12 +549,14 @@
   }
   function addToolPill(name) {
-    ensureStreamingAgentBubble();
-    if (toolPillsByName[name]) return;
+    ensureConv();
+    if (!$convScroll || toolPillsByName[name]) return;
     const pill = document.createElement("div");
     pill.className = "tool-pill";
     pill.innerHTML = `<div class="spinner"></div><span>${escapeHtml(name)}</span>`;
-    $convScroll.insertBefore(pill, streamingAgentEntry.el);
+    // Append at the end of the conversation flow — pills sit between the
+    // segment bubbles in the order tools actually run.
+    $convScroll.appendChild(pill);
     toolPillsByName[name] = pill;
     scrollConvToBottom();
   }
@@ -541,53 +593,46 @@
     const dur   = m.dur || 1;
     const fmt   = (s) => `0:${String(Math.round(s)).padStart(2, "0")}`;
     const audio = new Audio(m.audio);
+    m._audioEl = audio;          // the audio queue drives sequential playback
     let raf = null;
-    let progress = 0;
     const setProgress = (p) => {
-      progress = Math.max(0, Math.min(1, p));
-      const cur = Math.floor(progress * N);
+      p = Math.max(0, Math.min(1, p));
+      const cur = Math.floor(p * N);
       bars.forEach((b, i) => {
         b.classList.toggle("on", i <= cur);
         b.classList.toggle("cur", i === cur && !audio.paused);
       });
-      $dur.textContent = progress > 0 || !audio.paused ? fmt(progress * dur) : fmt(dur);
+      $dur.textContent = p > 0 || !audio.paused ? fmt(p * dur) : fmt(dur);
     };
     const tick = () => {
       if (audio.duration > 0) setProgress(audio.currentTime / audio.duration);
       raf = requestAnimationFrame(tick);
     };
-    audio.addEventListener("play",   () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); mode = "speaking"; render(); });
-    audio.addEventListener("pause",  () => { $play.innerHTML = ICON.play();  if (raf) cancelAnimationFrame(raf); if (mode === "speaking") { mode = "idle"; render(); } });
-    audio.addEventListener("ended",  () => { setProgress(1); if (mode === "speaking") { mode = "idle"; render(); } });
+    audio.addEventListener("play",  () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); if (mode !== "speaking") { mode = "speaking"; render(); } });
+    audio.addEventListener("pause", () => { $play.innerHTML = ICON.play();  if (raf) cancelAnimationFrame(raf); });
+    audio.addEventListener("ended", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); setProgress(1); onSegmentEnded(m); });
+    // 404 / decode error / autoplay block: don't hang — advance the queue.
+    audio.addEventListener("error", () => onSegmentEnded(m));
-    $play.addEventListener("click", () => audio.paused ? audio.play() : audio.pause());
+    $play.addEventListener("click", () => {
+      if (audio.paused) {
+        // Manual play takes control — stop the auto-sequence so we don't fight it.
+        queuePlaying = false;
+        try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
+        ttsAudio = audio;
+        audio.play().catch(() => { if (mode === "speaking") { mode = "idle"; render(); } });
+      } else {
+        audio.pause();
+        if (mode === "speaking") { mode = "idle"; render(); }
+      }
+    });
     $bar.addEventListener("click", (e) => {
       const r = $bar.getBoundingClientRect();
       const p = Math.max(0, Math.min(1, (e.clientX - r.left) / r.width));
       if (audio.duration > 0) audio.currentTime = p * audio.duration;
       setProgress(p);
     });
-    // If the audio errors out (404, decode error, autoplay block, etc) make
-    // sure the capsule doesn't stay stuck in "está hablando…".
-    audio.addEventListener("error", () => {
-      if (mode === "speaking") { mode = "idle"; render(); }
-    });
-    // autoplay if it's the fresh reply
-    if (m.fresh) {
-      m.fresh = false;
-      ttsAudio?.pause?.();
-      ttsAudio = audio;
-      audio.play().catch(() => {
-        // Autoplay block (rare in Electron with user-gesture but possible
-        // when the window has never been focused). Bail out so the capsule
-        // returns to idle and the user can still tap "play" on the scrubber.
-        if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
-      });
-    }
   }
   // Post-finalize hook: add a scrubber to an already-rendered agent turn
@@ -597,15 +642,18 @@
     if (!m) return;
     m.audio = url;
     m.dur   = dur || 0;
-    m.fresh = true; // autoplay the freshly-arrived reply
     const turnEl = $convScroll?.querySelector(`[data-id="${turnId}"]`);
-    if (!turnEl) return;
-    // Insert the scrubber HTML just before turn-actions (matches appendTurn order).
-    const actions = turnEl.querySelector(".turn-actions");
-    const html = buildScrubberHtml(m);
-    if (actions) actions.insertAdjacentHTML("beforebegin", html);
-    else turnEl.insertAdjacentHTML("beforeend", html);
-    wireScrubber(turnEl, m);
+    if (turnEl && !turnEl.querySelector(".audio")) {
+      // Insert the scrubber HTML just before turn-actions (matches appendTurn).
+      const actions = turnEl.querySelector(".turn-actions");
+      const html = buildScrubberHtml(m);
+      if (actions) actions.insertAdjacentHTML("beforebegin", html);
+      else turnEl.insertAdjacentHTML("beforeend", html);
+      wireScrubber(turnEl, m); // sets m._audioEl
+    }
+    // Audio is ready → let the sequential queue play it when it's this
+    // segment's turn (gapless auto-play across the turn's bubbles).
+    queueMarkReady(m);
     scrollConvToBottom();
   }
@@ -620,10 +668,94 @@
     return out;
   }
+  // ── Per-turn setup + sequential audio queue ──────────────────────────────
+  // Each turn renders N agent bubbles (segments), each with its own audio. We
+  // play those audios in `seq` order, gaplessly: the cursor waits at a segment
+  // until its TTS lands, plays it, then advances. So Roby "speaks" its messages
+  // one after another even though they synthesize at different speeds.
+  // Starts a new agent turn: bumps the turn id, clears the previous turn's
+  // audio queue, and resets the done / pending-TTS bookkeeping and its timer.
+  function beginAgentTurn() {
+    currentTurn++;
+    resetTurnAudio();
+    doneHandled = false;
+    pendingTtsTurnId = null;
+    if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
+  }
+  // Pauses the current audio (errors ignored) and returns the audio queue to
+  // its empty initial state, cancelling the turn watchdog if one is armed.
+  function resetTurnAudio() {
+    try { ttsAudio?.pause?.(); } catch {}
+    ttsAudio = null;
+    turnAudios = [];
+    audioCursor = 0;
+    queuePlaying = false;
+    turnDone = false;
+    if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
+  }
+  // Adds message `m` to the turn's audio queue (once per message object) and
+  // keeps the queue ordered by `m.seq`; a missing seq sorts as 0.
+  function queueRegisterSegment(m) {
+    if (!turnAudios.some((e) => e.m === m)) {
+      turnAudios.push({ m, ready: false, failed: false, played: false });
+      turnAudios.sort((a, b) => (a.m.seq || 0) - (b.m.seq || 0));
+    }
+  }
+  // Flags `m`'s queue entry (if it is registered) as ready to play, then pumps
+  // the queue so it starts playing if this segment is next.
+  function queueMarkReady(m) {
+    const e = turnAudios.find((x) => x.m === m);
+    if (e) e.ready = true;
+    pumpAudioQueue();
+  }
+  // Flags `m`'s queue entry (if it is registered) as failed. It is also marked
+  // ready and played so the cursor steps over it instead of waiting; then pumps.
+  function queueMarkFailed(m) {
+    const e = turnAudios.find((x) => x.m === m);
+    if (e) { e.ready = true; e.failed = true; e.played = true; }
+    pumpAudioQueue();
+  }
+  function pumpAudioQueue() {
+    if (queuePlaying) return;
+    while (audioCursor < turnAudios.length) {
+      const e = turnAudios[audioCursor];
+      if (!e.ready) return;                           // wait for this segment's TTS
+      if (e.played || e.failed || !e.m._audioEl) { audioCursor++; continue; }
+      const audio = e.m._audioEl;
+      queuePlaying = true;
+      try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
+      ttsAudio = audio;
+      audio.play().catch(() => {                       // autoplay blocked / decode error
+        queuePlaying = false;
+        e.played = true;
+        audioCursor++;
+        pumpAudioQueue();
+      });
+      return;
+    }
+    // Drained. Once the turn is done and nothing's left, return to idle.
+    if (turnDone) {
+      if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
+      if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
+    }
+  }
+  // Called from a segment audio's `ended` (or `error`). Advances the queue.
+  function onSegmentEnded(m) {
+    const e = turnAudios.find((x) => x.m === m);
+    if (e) { if (e.played) return; e.played = true; }
+    if (queuePlaying && ttsAudio === m._audioEl) {
+      queuePlaying = false;
+      audioCursor++;
+      pumpAudioQueue();
+    } else if (mode === "speaking") {
+      mode = "idle"; render();
+    }
+  }
   // ── Recording flow ───────────────────────────────────────────────────────
   function startListening() {
     if (mode !== "idle") return;
     isCancelled = false;
+    micReady = false;      // show "Cargando…" until the recorder is actually live
+    speechSeen = false;
+    lastVoiceTs = 0;
+    pausePreviewed = false;
+    reuseLiveOnStop = false;
+    livePromise = null;
+    pendingUserText = "";
+    // Warm the whisper model now (overlaps the mic warm-up), so the decode at
+    // the end of this utterance doesn't pay a cold start.
+    window.apx?.warmupStt?.();
     mode = "listening";
     render();
     startMic();
@@ -649,6 +781,7 @@
     if (mode === "listening") { stopMic(); }
     if (mode === "thinking" || mode === "speaking") { window.apx?.cancel?.(); }
     removePendingUserPartial();
+    resetTurnAudio();   // stop any playing/queued segment audio
     if (streamingAgentEntry) {
       streamingAgentEntry.el.remove();
       streamingAgentEntry = null;
@@ -658,6 +791,8 @@
   }
   function stopSpeaking() {
+    // Halt the auto-sequence and the current segment.
+    queuePlaying = false;
     try { ttsAudio?.pause?.(); } catch {}
     if (mode === "speaking") { mode = "idle"; render(); }
   }
@@ -678,6 +813,7 @@
         analyser.maxDecibels = -15;                   // ceiling (loud speech)
         src.connect(analyser);
         freqData = new Uint8Array(analyser.frequencyBinCount);
+        timeData = new Uint8Array(analyser.fftSize);
         startWaveLoop();
       } catch (e) {
         console.warn("desktop renderer: AnalyserNode init failed", e);
@@ -691,14 +827,27 @@
       recordedChunks = [];
       mediaRecorder = new MediaRecorder(audioStream, { mimeType, audioBitsPerSecond: 32000 });
       mediaRecorder.ondataavailable = (e) => {
+        // Just buffer. We deliberately do NOT decode on every chunk anymore —
+        // re-decoding the growing clip every 2s serialized on the single
+        // whisper thread and the final decode queued behind it (the old ~10s
+        // stall). Transcription now happens once, on a pause / on stop.
         if (e.data && e.data.size > 0) recordedChunks.push(e.data);
-        runLivePartial();
       };
       mediaRecorder.onstop = async () => {
         if (isCancelled) { recordedChunks = []; if (mode !== "idle") { mode = "idle"; render(); } return; }
-        const raw = await transcribeBuffered();
-        const text = (raw || "").trim();
+        let text = "";
+        // Auto-send after a pause: the pause already kicked a full decode that
+        // covers all the speech (the only thing after it is trailing silence),
+        // so reuse it instead of decoding the same audio again. Await the
+        // in-flight preview if it hasn't settled yet.
+        if (reuseLiveOnStop) {
+          if (livePromise) { try { await livePromise; } catch {} }
+          text = (pendingUserText || "").trim();
+        }
+        // Manual send (Enviar / ⌘G release) or no preview yet → one fresh decode.
+        if (!text) text = (await transcribeBuffered()).trim();
         recordedChunks = [];
+        reuseLiveOnStop = false;
         // Guard with .trim() — whisper occasionally returns a single space or
         // newline for very short clips, which used to commit an empty bubble.
         if (!text || isCancelled) {
@@ -711,7 +860,16 @@
         pendingUserText = text;
         commitUserMessage(text, /* via */ "voice");
       };
-      mediaRecorder.start(2000);
+      // 1s timeslice: chunks land often enough that a pause-preview decode has
+      // audio to work with even for short utterances. We no longer decode per
+      // chunk (just buffer), so a smaller slice is essentially free.
+      mediaRecorder.start(1000);
+      // Recorder is now live → swap "Cargando…" for the reactive wave and let
+      // silence detection arm. lastVoiceTs is seeded to now; a fully silent open
+      // still won't auto-send because the silence branch requires speechSeen.
+      micReady = true;
+      lastVoiceTs = Date.now();
+      if (mode === "listening") render();
     } catch (e) {
       console.error("desktop renderer: mic error", e);
       mode = "idle";
@@ -723,11 +881,16 @@
     try { audioStream?.getTracks().forEach((t) => t.stop()); } catch {}
     mediaRecorder = null;
     audioStream = null;
+    micReady = false;
+    speechSeen = false;
+    lastVoiceTs = 0;
+    pausePreviewed = false;
     stopWaveLoop();
     try { audioCtx?.close(); } catch {}
     audioCtx = null;
     analyser = null;
     freqData = null;
+    timeData = null;
   }
   // ── Reactive wave: amplitude-driven bar heights (runs while mode === listening)
@@ -738,6 +901,43 @@
     const tick = () => {
       if (mode !== "listening" || !analyser) { waveRaf = null; return; }
       analyser.getByteFrequencyData(freqData);
+      // ── Silence auto-send ──────────────────────────────────────────────
+      // Time-domain RMS is a reliable voice/silence gate (unlike the freq
+      // bars, it's independent of the analyser's dB scaling). Once we've heard
+      // speech, SILENCE_MS of quiet commits the recording on its own.
+      if (micReady && timeData) {
+        analyser.getByteTimeDomainData(timeData);
+        let sumSq = 0;
+        for (let i = 0; i < timeData.length; i++) {
+          const v = (timeData[i] - 128) / 128;
+          sumSq += v * v;
+        }
+        const rms = Math.sqrt(sumSq / timeData.length);
+        const now = Date.now();
+        if (rms > VOICE_RMS) {
+          speechSeen = true;
+          lastVoiceTs = now;
+          pausePreviewed = false;            // new speech → allow a fresh preview
+        } else if (speechSeen && lastVoiceTs) {
+          const silentFor = now - lastVoiceTs;
+          // A short pause kicks ONE decode of what has been buffered so far. Its
+          // result is reused as the final transcription, so the auto-send below
+          // skips the post-stop decode (onstop only awaits it if still running).
+          if (!pausePreviewed && silentFor >= PAUSE_PREVIEW_MS && !liveBusy) {
+            pausePreviewed = true;
+            runLivePartial();
+          }
+          // Sustained silence → auto-send, reusing the pause decode.
+          if (silentFor >= SILENCE_MS) {
+            waveRaf = null;
+            reuseLiveOnStop = true;
+            stopListening(/* commit */ true);
+            return;
+          }
+        }
+      }
       const wave = $capCenter.querySelector(".cap-wave");
       if (wave) {
         const bars = wave.children;
@@ -781,17 +981,20 @@
     } catch {}
     return "";
   }
-  async function runLivePartial() {
+  // Decode what's been recorded so far (kicked by the wave loop on a speech
+  // pause). The result is stashed in pendingUserText so the auto-send on stop
+  // can reuse it; onstop only decodes again when that text is empty.
+  // livePromise lets onstop await an in-flight decode before reading the text.
+  function runLivePartial() {
     if (liveBusy || mode !== "listening" || !recordedChunks.length) return;
     liveBusy = true;
-    try {
-      const text = await transcribeBuffered();
-      if (text && mode === "listening") {
-        pendingUserText = text;
-        // No visible live preview in the capsule wave mode; update is mostly
-        // useful for the conv pending-user partial during transcribing.
-      }
-    } finally { liveBusy = false; }
+    livePromise = (async () => {
+      try {
+        const text = await transcribeBuffered();
+        if (text && mode === "listening") pendingUserText = text;
+      } finally { liveBusy = false; }
+    })();
+    return livePromise;
   }
   // ── Send: text path + post-transcription commit path ─────────────────────
@@ -819,12 +1022,10 @@
   // one ResizeObserver tick later). Shared by commitUserMessage + regen so
   // both paths set up the daemon-event pipeline identically.
   function startAgentTurn() {
-    doneHandled = false;
-    pendingTtsTurnId = null;
-    if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
+    beginAgentTurn();      // bump currentTurn + reset the audio queue/guards
     mode = "thinking";
     render();
-    ensureStreamingAgentBubble();
+    ensureConv();          // segments will mount their own bubbles
     requestWindowResize();
   }
@@ -861,73 +1062,88 @@
   window.apx?.onDaemonEvent?.((msg) => {
     switch (msg.type) {
       case "thinking":
-        if (mode !== "thinking" && mode !== "speaking") { mode = "thinking"; render(); }
-        ensureStreamingAgentBubble();
+        // Marks the start of a turn. For locally-initiated turns startAgentTurn
+        // already ran beginAgentTurn() (mode is already "thinking"); for turns
+        // NOT initiated in this window (injected / broadcast from another client)
+        // we set them up here so currentTurn/queue/doneHandled are correct and
+        // the turn doesn't hang.
+        if (mode !== "thinking" && mode !== "speaking") {
+          beginAgentTurn();
+          mode = "thinking";
+          render();
+        } else {
+          doneHandled = false;
+        }
+        ensureConv();
         break;
       case "token":
+        // Legacy path (backend no longer streams tokens for desktop). Kept so a
+        // mixed-version daemon doesn't break — accumulate into a single bubble.
         appendStreamingToken(msg.text || "");
         break;
       case "tool_start":  addToolPill(msg.name); break;
       case "tool_done":   updateToolPill(msg.name); break;
+      case "segment": {
+        // Each segment is its own agent message bubble + its own audio.
+        ensureConv();
+        const text = (msg.text || "").trim();
+        if (!text) break;
+        const id = nextId++;
+        const m = { id, seq: msg.seq || 0, turn: currentTurn, role: "agent", text, t: nowHHMM(), audio: null, dur: null };
+        messages.push(m);
+        appendTurn(m, true);
+        queueRegisterSegment(m);
+        // Synthesize THIS segment; tts-ready(seg=id) attaches its audio + queues
+        // it for gapless sequential playback.
+        window.apx?.requestTts?.(text, id);
+        requestWindowResize();
+        scrollConvToBottom();
+        break;
+      }
       case "done": {
-        // Daemon may emit `done` twice (retry/race). Process only once per turn.
         if (doneHandled) break;
         doneHandled = true;
-        const finalText = msg.text || streamingAgentEntry?.text || "";
-        // CRITICAL: many models (gemini-flash, groq-fast tier) don't stream
-        // tokens — they send the whole reply in `done`. Without this branch
-        // the bubble stays with just the dots placeholder until TTS resolves
-        // (or 6s timeout), which feels broken. Inject the text NOW so the
-        // user sees the reply immediately.
-        if (streamingAgentEntry) {
-          streamingAgentEntry.text = finalText;
-          if (!streamingAgentEntry.started && finalText) {
-            streamingAgentEntry.started = true;
-            streamingAgentEntry.msgEl.innerHTML = formatWordsHtml(finalText);
-            scrollConvToBottom();
-          }
-        }
-        // Finalize and return to idle right away so the capsule frees up.
-        // TTS runs in the background; tts-ready will attach the scrubber to
-        // the already-rendered turn (see attachAudioToLastAgentTurn below).
-        const finalizedTurnId = streamingAgentEntry?.id;
-        finalizeStreamingAgent();
-        mode = "idle"; render();
-        // Fire-and-forget TTS request. If it returns audio, attach it to
-        // the turn we just rendered; if it errors / times out / never replies,
-        // no big deal — the user already has the text. Guard with a 6s soft
-        // timeout so a stuck request doesn't hold ttsTimer state.
-        const handled = window.apx?.requestTts?.(finalText);
-        if (handled) {
-          if (ttsTimer) clearTimeout(ttsTimer);
-          ttsTimer = setTimeout(() => { ttsTimer = null; }, 6000);
-          // Remember which turn the next tts-ready/failed belongs to.
-          pendingTtsTurnId = finalizedTurnId || null;
+        turnDone = true;
+        // Record the whole turn as one assistant entry for conversation context.
+        const full = (msg.text || "").trim();
+        if (full) history.push({ role: "assistant", content: full });
+        // Safety net: if some segment's TTS never resolves, flush after 12s so
+        // the capsule can't get stuck in "Pensando…".
+        if (turnWatchdog) clearTimeout(turnWatchdog);
+        turnWatchdog = setTimeout(() => {
+          turnAudios.forEach((e) => { if (!e.ready) { e.ready = true; e.failed = true; e.played = true; } });
+          pumpAudioQueue();
+        }, 12000);
+        // Play whatever audio is already ready; flip to idle if there's nothing
+        // left to play (e.g. a turn that produced no audio).
+        pumpAudioQueue();
+        if (!queuePlaying && audioCursor >= turnAudios.length && mode !== "speaking") {
+          mode = "idle"; render();
         }
         break;
       }
-      case "tts-ready": {
-        if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
-        if (pendingTtsTurnId != null) {
-          attachAudioToTurn(pendingTtsTurnId, { url: msg.url, dur: msg.duration });
-          pendingTtsTurnId = null;
-        }
+      case "tts-ready":
+        if (msg.seg != null) attachAudioToTurn(msg.seg, { url: msg.url, dur: msg.duration });
         break;
-      }
-      case "tts-failed":
-        // The text is already on screen; just clean up the timer + pending id.
-        if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
-        pendingTtsTurnId = null;
+      case "tts-failed": {
+        // No audio for this segment — skip it in the queue so playback advances.
+        const m = (msg.seg != null) ? messages.find((x) => x.id === msg.seg) : null;
+        if (m) queueMarkFailed(m);
         break;
-      case "error":
-        finalizeStreamingAgentError(msg.message || "Unknown error");
+      }
+      case "error": {
+        ensureConv();
+        const id = nextId++;
+        const m = { id, seq: 9999, turn: currentTurn, role: "agent", text: "Error: " + (msg.message || "Unknown error"), t: nowHHMM(), isError: true };
+        messages.push(m);
+        appendTurn(m, true);
+        turnDone = true;
+        if (mode !== "speaking") { mode = "idle"; render(); }
         break;
+      }
       case "cancelled":
-        if (streamingAgentEntry) {
-          if (!streamingAgentEntry.text) streamingAgentEntry.el.remove();
-          else finalizeStreamingAgent();
-          streamingAgentEntry = null;
-        }
+        resetTurnAudio();
+        turnDone = true;
         mode = "idle"; render();
         break;
     }
@@ -949,8 +1165,20 @@
   document.addEventListener("keydown", (e) => {
     if (e.key === "Escape") {
       e.preventDefault();
-      if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") cancel();
-      else closeWindow();
+      // Escape cancels whatever is in flight (recording / transcribing /
+      // thinking / speaking). If nothing is in flight, a half-typed draft is
+      // cleared first; only an empty idle capsule closes the window.
+      if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") {
+        cancel();
+        return;
+      }
+      const input = $capCenter.querySelector("input");
+      if (input && input.value.trim()) {
+        input.value = "";
+        render();
+      } else {
+        closeWindow();
+      }
     }
   });
@@ -975,6 +1203,13 @@
     setInterval(requestWindowResize, 250);
   }
+  // ── Keep STT warm ────────────────────────────────────────────────────────
+  // The whisper server idles out after ~10 min. While the desktop window is
+  // running we ping it every 4 min (and once now) so it stays loaded — the
+  // user's first utterance never pays the cold-load cost.
+  window.apx?.warmupStt?.();
+  setInterval(() => { window.apx?.warmupStt?.(); }, 4 * 60 * 1000);
   // ── Helpers ──────────────────────────────────────────────────────────────
   function nowHHMM() {
     const d = new Date();