@agentprojectcontext/apx 1.27.2 → 1.29.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentprojectcontext/apx",
3
- "version": "1.27.2",
3
+ "version": "1.29.0",
4
4
  "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -6,6 +6,18 @@
6
6
  //
7
7
  // Shared by overlay, telegram voice messages, and any external caller.
8
8
  export function register(app) {
9
+ // GET /transcribe/warmup — load the local whisper model (if needed) and reset
10
+ // its idle watchdog. Callers (e.g. the desktop window) ping this while open so
11
+ // the first real utterance doesn't pay the cold-load cost.
12
+ app.get("/transcribe/warmup", async (_req, res) => {
13
+ try {
14
+ const { warmupWhisper } = await import("../transcription.js");
15
+ res.json(await warmupWhisper());
16
+ } catch (e) {
17
+ res.status(500).json({ ok: false, error: e.message });
18
+ }
19
+ });
20
+
9
21
  app.post("/transcribe/chunk", async (req, res) => {
10
22
  const chunks = [];
11
23
  req.on("data", (c) => chunks.push(c));
@@ -100,9 +100,27 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
100
100
  await appendGlobalMessage({ channel: CHANNEL, direction: "in", type: "user", author: "user", body: text });
101
101
  } catch {}
102
102
 
103
- let fullResponse = "";
104
103
  let toolsExecuted = [];
105
104
 
105
+ // Per-segment streaming: instead of merging the whole turn into one blob, we
106
+ // emit each assistant text piece as its own `segment` (an intro before a tool,
107
+ // then the post-tool answer, …). The renderer renders each as its own bubble
108
+ // and synthesizes its own audio, so a multi-step reply reads as separate spoken
109
+ // messages instead of one run-on bubble. `liveBuf` accumulates streamed tokens
110
+ // (streaming engines) so they can be flushed as a segment at each boundary;
111
+ // for non-streaming models like gemini the text arrives whole via events.
112
+ let segSeq = 0;
113
+ let lastSegText = "";
114
+ let liveBuf = "";
115
+ const emittedSegments = [];
116
+ const emitSegment = (raw) => {
117
+ const seg = (raw || "").trim();
118
+ if (!seg || seg === lastSegText) return;
119
+ lastSegText = seg;
120
+ emittedSegments.push(seg);
121
+ _send(ws, { type: "segment", seq: ++segSeq, text: seg });
122
+ };
123
+
106
124
  try {
107
125
  if (!isSuperAgentEnabled(config)) {
108
126
  throw new Error("super-agent not enabled — set super_agent.enabled + super_agent.model in ~/.apx/config.json");
@@ -120,10 +138,7 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
120
138
  previousMessages: history.slice(0, -1),
121
139
  overrideModel: cfg.model || null,
122
140
  signal: controller.signal,
123
- onToken: (chunk) => {
124
- fullResponse += chunk;
125
- _send(ws, { type: "token", text: chunk });
126
- },
141
+ onToken: (chunk) => { liveBuf += chunk; },
127
142
  onEvent: async (event) => {
128
143
  if (event.type === "tool_start") {
129
144
  const t = event.trace;
@@ -131,17 +146,24 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
131
146
  _send(ws, { type: "tool_start", name: t.tool, args: t.args });
132
147
  } else if (event.type === "tool_result") {
133
148
  _send(ws, { type: "tool_done", name: event.trace.tool });
134
- } else if (event.type === "assistant_text" && event.text && !fullResponse) {
135
- _send(ws, { type: "token", text: event.text });
136
- fullResponse += event.text;
149
+ } else if (event.type === "assistant_text" && event.text) {
150
+ // A complete assistant text segment (e.g. the "I'll check…" intro
151
+ // emitted right before a tool runs). Ship it as its own message.
152
+ emitSegment(event.text);
153
+ liveBuf = "";
137
154
  }
138
155
  },
139
156
  });
140
- const finalText = fullResponse || result.text || "";
141
- log(`desktop: super-agent turn done in ${Date.now() - t0}ms text_len=${finalText.length}`);
157
+ // The final (no-tool) iteration's answer appears ONLY in result.text (or, for
158
+ // streaming engines, in liveBuf) — it's never emitted as an event. Ship it as
159
+ // the closing segment (deduped against the last one).
160
+ emitSegment((result.text || "").trim() || liveBuf.trim());
161
+
162
+ const finalText = emittedSegments.join("\n\n");
163
+ log(`desktop: super-agent turn done in ${Date.now() - t0}ms segments=${segSeq} text_len=${finalText.length} tools=${toolsExecuted.length}`);
142
164
 
143
- // Emit done with full text
144
- _send(ws, { type: "done", text: finalText });
165
+ // Turn end. `segments` lets the renderer know how many bubbles to expect.
166
+ _send(ws, { type: "done", segments: segSeq, text: finalText });
145
167
 
146
168
  // Append assistant turn to history
147
169
  if (ws && histories) {
@@ -481,6 +481,35 @@ export async function preloadWhisperServer(log = console.log) {
481
481
  }
482
482
  }
483
483
 
484
/**
 * Keep the local whisper server warm.
 *
 * For the "openai" provider there is nothing local to warm, so this returns
 * immediately. Otherwise it makes sure the local whisper server is running
 * (ensureWhisperServer) and then requests its /warmup endpoint. The warm-up
 * request is best-effort: if it fails or times out, the call still succeeds
 * and simply reports `loaded: false`.
 *
 * Never throws — every failure is folded into the returned object.
 *
 * @returns {Promise<{ok: boolean, provider?: string, model?: *, loaded?: boolean, error?: string}>}
 *   `{ ok: true, provider: "openai", loaded: false }` for the remote provider,
 *   `{ ok: true, provider: "local", model, loaded }` for the local server, or
 *   `{ ok: false, error }` when config lookup / server start-up failed.
 */
export async function warmupWhisper() {
  try {
    const cfg = await getConfig();
    if (cfg.provider === "openai") return { ok: true, provider: "openai", loaded: false };
    await ensureWhisperServer(cfg.local);

    // The first /warmup call may block for a while on a cold model, hence the
    // generous 40s timeout on the request.
    let loaded = false;
    try {
      const r = await fetch(`http://127.0.0.1:${WHISPER_PORT}/warmup`, {
        signal: AbortSignal.timeout(40_000),
      });
      // A non-JSON body is treated as "nothing reported".
      const j = await r.json().catch(() => ({}));
      loaded = Boolean(j?.loaded);
    } catch {
      // Deliberately best-effort: a failed or timed-out warm-up is not an
      // error for the caller, it just leaves `loaded` false.
    }
    return { ok: true, provider: "local", model: _serverModel, loaded };
  } catch (e) {
    // `e` may be a non-Error value (even null); never let reading `.message`
    // break the "never throws" guarantee.
    return { ok: false, error: e?.message ?? String(e) };
  }
}
512
+
484
513
  /**
485
514
  * Stop the whisper server we own (no-op if we adopted an external one).
486
515
  */
@@ -94,6 +94,17 @@ class _Handler(BaseHTTPRequestHandler):
94
94
  "model": _model_name or _Handler.model_name,
95
95
  "loaded": _model is not None,
96
96
  })
97
+ elif self.path == "/warmup":
98
+ # Eagerly load the model into RAM (no audio needed) and reset the
99
+ # idle timer, so the first real transcription isn't cold. Blocks
100
+ # until the model is loaded the first time; instant once warm.
101
+ _touch()
102
+ with _model_lock:
103
+ try:
104
+ _load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
105
+ self._send_json(200, {"ok": True, "loaded": _model is not None, "model": _model_name})
106
+ except Exception as e:
107
+ self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
97
108
  else:
98
109
  self._send_json(404, {"ok": False, "error": "not found"})
99
110
 
@@ -46,6 +46,20 @@ function getShortcut() {
46
46
  return cfg?.desktop?.shortcut || cfg?.overlay?.shortcut || DEFAULT_SHORTCUT;
47
47
  }
48
48
 
49
// Voice-capture timing for the listening capsule. Overridable in config.json:
//   "desktop": { "silence_ms": 1200, "voice_rms": 0.025 }
// silence_ms — quiet after speech before auto-send (floored at 400).
// voice_rms  — RMS above which audio counts as voice (lower = more
//              sensitive; floored at 0).
// Settings are read from the "desktop" section, falling back to "overlay".
// Any value that is not a finite number falls back to its default.
function getVoiceTiming() {
  const cfg = readApxConfig();
  const section = cfg?.desktop || cfg?.overlay || {};
  // Number.isFinite never coerces, so strings, NaN and ±Infinity are all
  // rejected in one check.
  const num = (value, fallback) => (Number.isFinite(value) ? value : fallback);
  return {
    silence_ms: Math.max(400, num(section.silence_ms, 1200)),
    voice_rms: Math.max(0, num(section.voice_rms, 0.025)),
  };
}
62
+
49
63
  function readToken() {
50
64
  try { return fs.readFileSync(TOKEN_PATH, "utf8").trim(); } catch { return ""; }
51
65
  }
@@ -397,6 +411,7 @@ ipcMain.handle("get-shortcut", () => getShortcut());
397
411
  ipcMain.handle("get-theme", () => getTheme());
398
412
  ipcMain.handle("get-position", () => getPosition());
399
413
  ipcMain.handle("get-agent-name", () => getAgentName());
414
+ ipcMain.handle("get-voice-timing", () => getVoiceTiming());
400
415
 
401
416
  // Renderer asks main to grow/shrink the window to fit its content.
402
417
  // Clamped to [WIN_H_MIN, getMaxWindowHeight()]; same anchor (top edge stays put).
@@ -411,27 +426,29 @@ ipcMain.on("resize-window", (_e, { height }) => {
411
426
  // Renderer asks for TTS playback of the agent reply. We synthesize via the
412
427
  // daemon and pipe the audio path back as a daemon-event the renderer already
413
428
  // knows how to consume (tts-ready { url, duration } / tts-failed).
414
- ipcMain.handle("request-tts", async (_e, { text }) => {
429
+ ipcMain.handle("request-tts", async (_e, { text, seg }) => {
415
430
  if (!text || !text.trim()) {
416
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed" });
431
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg });
417
432
  return;
418
433
  }
419
434
  try {
420
435
  const result = await daemonTtsSay(text);
421
436
  if (result?.ok && result.audio_path) {
422
437
  // Expose the local file via file:// — preload's contextIsolation lets
423
- // the renderer's <audio> tag fetch it directly.
438
+ // the renderer's <audio> tag fetch it directly. `seg` ties this audio to
439
+ // the bubble that asked for it.
424
440
  const url = "file://" + result.audio_path;
425
441
  mainWindow?.webContents.send("daemon-event", {
426
442
  type: "tts-ready",
443
+ seg,
427
444
  url,
428
445
  duration: result.duration_s || 0,
429
446
  });
430
447
  } else {
431
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: result?.error || "no audio" });
448
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: result?.error || "no audio" });
432
449
  }
433
450
  } catch (e) {
434
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: e.message });
451
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: e.message });
435
452
  }
436
453
  });
437
454
 
@@ -462,6 +479,32 @@ ipcMain.handle("check-whisper-ready", () => {
462
479
  });
463
480
  });
464
481
 
482
+ // Renderer asks to keep STT warm. Routed through the daemon (not whisper
483
+ // directly) so it both LOADS the model if it idled out and resets the idle
484
+ // watchdog. Fire-and-forget from the renderer's side.
485
+ ipcMain.handle("warmup-stt", async () => {
486
+ return new Promise((resolve) => {
487
+ const token = readToken();
488
+ const options = {
489
+ hostname: DAEMON_HOST,
490
+ port: DAEMON_PORT,
491
+ path: "/transcribe/warmup",
492
+ method: "GET",
493
+ headers: { ...(token ? { "Authorization": `Bearer ${token}` } : {}) },
494
+ };
495
+ const req = http.request(options, (res) => {
496
+ let data = "";
497
+ res.on("data", (c) => data += c);
498
+ res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve({ ok: false }); } });
499
+ });
500
+ req.on("error", () => resolve({ ok: false }));
501
+ // Cold model load can take ~30s; give it room. (Renderer fires this
502
+ // fire-and-forget, so a long warm-up never blocks the UI.)
503
+ req.setTimeout(45000, () => { req.destroy(); resolve({ ok: false }); });
504
+ req.end();
505
+ });
506
+ });
507
+
465
508
  // Renderer requests recording toggle (ESC cancels, shortcut toggles)
466
509
  ipcMain.handle("toggle-recording", async () => {
467
510
  if (isRecording) stopRecording(); else startRecording();
@@ -18,14 +18,19 @@ contextBridge.exposeInMainWorld("apx", {
18
18
  // Check if the whisper model is loaded (false = still loading)
19
19
  checkWhisperReady: () => ipcRenderer.invoke("check-whisper-ready"),
20
20
 
21
+ // Keep STT warm (loads the model if idle + resets the idle timer). Called
22
+ // while the window is open / on mic-open so the first decode isn't cold.
23
+ warmupStt: () => ipcRenderer.invoke("warmup-stt").catch(() => ({ ok: false })),
24
+
21
25
  // Send final text to daemon
22
26
  sendMessage: (text, previousMessages) =>
23
27
  ipcRenderer.invoke("send-message", { text, previousMessages }),
24
28
 
25
- // After "done", ask main to synthesize TTS. Returns true if main will reply
26
- // with a tts-ready / tts-failed daemon-event; false if TTS is not wired.
27
- requestTts: (text) => {
28
- ipcRenderer.invoke("request-tts", { text }).catch(() => {});
29
+ // Ask main to synthesize TTS for one segment. `seg` correlates the resulting
30
+ // tts-ready/tts-failed event back to the bubble that requested it (each
31
+ // assistant message has its own audio). Returns true optimistically.
32
+ requestTts: (text, seg) => {
33
+ ipcRenderer.invoke("request-tts", { text, seg }).catch(() => {});
29
34
  return true; // optimistic; renderer waits for the event either way
30
35
  },
31
36
 
@@ -42,6 +47,7 @@ contextBridge.exposeInMainWorld("apx", {
42
47
  getTheme: () => ipcRenderer.invoke("get-theme"),
43
48
  getPosition: () => ipcRenderer.invoke("get-position"),
44
49
  getAgentName: () => ipcRenderer.invoke("get-agent-name"),
50
+ getVoiceTiming: () => ipcRenderer.invoke("get-voice-timing"),
45
51
 
46
52
  // Renderer asks main to resize the BrowserWindow to the rendered height
47
53
  resize: (height) => ipcRenderer.send("resize-window", { height }),
@@ -31,15 +31,49 @@
31
31
  let recorderFormat = "webm";
32
32
  let liveBusy = false;
33
33
 
34
+ // Mic is async to open (getUserMedia + recorder warm-up). Until it's actually
35
+ // capturing we show a "Cargando…" state instead of the wave, so the user
36
+ // doesn't talk into the dead gap before the recorder starts.
37
+ let micReady = false;
38
+
39
+ // Silence auto-send: once speech has been heard, SILENCE_MS of quiet
40
+ // auto-commits the recording. RMS (time-domain) is the voice/silence gate.
41
+ // Both are overridable from config.json (desktop.silence_ms / voice_rms).
42
+ let speechSeen = false;
43
+ let lastVoiceTs = 0;
44
+ let SILENCE_MS = 1200; // quiet after speech → send on its own
45
+ let VOICE_RMS = 0.025; // RMS above this counts as voice (0 = silence)
46
+ const PAUSE_PREVIEW_MS = 600; // a short pause kicks ONE decode (reused on send)
47
+
48
+ // When a pause triggers a preview decode, that decode already covers all the
49
+ // speech (the tail is just trailing silence), so the auto-send reuses it
50
+ // instead of paying a second full decode. These coordinate that handoff.
51
+ let pausePreviewed = false; // a preview decode fired for the current pause
52
+ let reuseLiveOnStop = false; // commit should reuse pendingUserText, not re-decode
53
+ let livePromise = null; // in-flight preview decode (awaited on reuse)
54
+
34
55
  // Web Audio analyser — drives the live capsule wave from real mic amplitude
35
56
  let audioCtx = null;
36
57
  let analyser = null;
37
58
  let freqData = null;
59
+ let timeData = null;
38
60
  let waveRaf = null;
39
61
 
40
- let streamingAgentEntry = null; // { id, role:'agent', el, ... } during thinking/speaking
41
- let toolPillsByName = {}; // active tool pills inside the streaming bubble row
42
- let ttsAudio = null; // <audio> playing the agent reply
62
+ let streamingAgentEntry = null; // legacy single-bubble streaming (kept dormant)
63
+ let toolPillsByName = {}; // active tool pills, by tool name, for the live turn
64
+ let ttsAudio = null; // <audio> currently playing
65
+
66
+ // ── Per-segment turn rendering ──────────────────────────────────────────
67
+ // A turn is now N agent message bubbles (intro, post-tool answer, …), each
68
+ // with its own audio. `currentTurn` tags every bubble of a turn so regen can
69
+ // drop the whole turn. The audio queue plays segment audios in seq order
70
+ // (gapless auto-play), waiting at the cursor for each segment's TTS to land.
71
+ let currentTurn = 0;
72
+ let turnAudios = []; // [{ m, ready, failed, played }] ordered by seq
73
+ let audioCursor = 0; // index of the next segment to play
74
+ let queuePlaying = false; // a segment audio is currently playing
75
+ let turnDone = false; // `done` received for the active turn
76
+ let turnWatchdog = null; // flushes the queue if a segment's TTS hangs
43
77
 
44
78
  let history = []; // [{role:'user'|'assistant', content}] sent to daemon for context
45
79
  let theme = "light";
@@ -119,10 +153,15 @@
119
153
  window.apx?.getPosition?.() ?? "right",
120
154
  window.apx?.getShortcut?.() ?? "CommandOrControl+G",
121
155
  window.apx?.getAgentName?.() ?? "Superagente",
122
- ]).then(([th, pos, shortcut, name]) => {
156
+ window.apx?.getVoiceTiming?.() ?? null,
157
+ ]).then(([th, pos, shortcut, name, timing]) => {
123
158
  theme = th || "light";
124
159
  position = pos || "right";
125
160
  agentName = (name && String(name).trim()) || "Superagente";
161
+ if (timing) {
162
+ if (typeof timing.silence_ms === "number") SILENCE_MS = timing.silence_ms;
163
+ if (typeof timing.voice_rms === "number") VOICE_RMS = timing.voice_rms;
164
+ }
126
165
  document.documentElement.setAttribute("data-theme", theme);
127
166
  setPosition(position);
128
167
  initialCaption(shortcut);
@@ -217,6 +256,13 @@
217
256
  }
218
257
  }
219
258
  // else: input already there → leave it alone (preserves focus + caret)
259
+ } else if (mode === "listening" && !micReady) {
260
+ // Mic still opening (getUserMedia + recorder warm-up). Show a loading
261
+ // status so the user waits for capture instead of talking into the gap.
262
+ if ($capCenter.dataset.mode !== "loading") {
263
+ $capCenter.dataset.mode = "loading";
264
+ $capCenter.innerHTML = `<span class="status"><span class="dots"><i></i><i></i><i></i></span><span class="shimmer">Cargando…</span></span>`;
265
+ }
220
266
  } else if (mode === "listening") {
221
267
  // Only rebuild the wave if it's not already there (avoids restarting
222
268
  // CSS animations / Web Audio binding every render).
@@ -246,9 +292,10 @@
246
292
  }
247
293
  }
248
294
  }
249
- // Clear data-mode when we're back to idle/listening so a future busy mode
250
- // re-renders correctly.
251
- if (mode === "idle" || mode === "listening") $capCenter.dataset.mode = "";
295
+ // Clear data-mode when we're back to idle, or once the live wave is up, so
296
+ // a future busy mode re-renders correctly. While the mic is still warming
297
+ // up we keep the "loading" marker so "Cargando…" isn't rebuilt every frame.
298
+ if (mode === "idle" || (mode === "listening" && micReady)) $capCenter.dataset.mode = "";
252
299
 
253
300
  // actions
254
301
  $capActions.innerHTML = "";
@@ -273,7 +320,8 @@
273
320
  }
274
321
  } else if (mode === "listening") {
275
322
  addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
276
- addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
323
+ // No "Enviar" until the recorder is live — nothing to send mid-warm-up.
324
+ if (micReady) addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
277
325
  } else if (mode === "transcribing") {
278
326
  addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
279
327
  } else if (mode === "thinking") {
@@ -321,7 +369,6 @@
321
369
  // Re-render all existing turns
322
370
  messages.forEach((m, i) => appendTurn(m, i === messages.length - 1));
323
371
  if (mode === "transcribing") renderPendingUserPartial();
324
- if (mode === "thinking" || mode === "speaking") ensureStreamingAgentBubble();
325
372
  }
326
373
  }
327
374
 
@@ -342,17 +389,28 @@
342
389
  <div class="bubble-user">${escapeHtml(m.text)}${viaIcon}</div>
343
390
  `;
344
391
  } else {
345
- t.innerHTML = `
392
+ // Consecutive agent messages (intro + post-tool answer …) read as one
393
+ // continued reply: only the FIRST shows the "Roby" header — the rest skip
394
+ // it so a tool turn isn't a stack of repeated "Roby" labels. A new header
395
+ // only appears when something (a user message) breaks the run.
396
+ const idx = messages.indexOf(m);
397
+ const prevMsg = idx > 0 ? messages[idx - 1] : null;
398
+ const agentCont = !!(prevMsg && prevMsg.role === "agent");
399
+ if (agentCont) t.classList.add("cont");
400
+ const header = agentCont ? "" : `
346
401
  <div class="role agent">
347
402
  <span class="ava sa"><img src="assets/superagent.png" alt=""/></span>
348
403
  <span class="who">${escapeHtml(agentName)}</span>
349
404
  <span class="time">${m.t || ""}</span>
350
- </div>
351
- <div class="msg-agent">${formatWordsHtml(m.text)}</div>
352
- ${m.audio ? "" /* scrubber added separately */ : ""}
405
+ </div>`;
406
+ // Copy is an inline icon at the end of the text, hover-only, so it never
407
+ // reserves an empty row. Regenerate lives in turn-actions and CSS shows it
408
+ // only on the last turn.
409
+ t.innerHTML = `
410
+ ${header}
411
+ <div class="msg-agent">${formatWordsHtml(m.text)}<button class="btn-copy" aria-label="Copiar" title="Copiar">${ICON.copy()}</button></div>
353
412
  <div class="turn-actions">
354
413
  <button class="chip btn-regen">${ICON.refresh()} Regenerar</button>
355
- <button class="chip btn-copy">${ICON.copy()} Copiar</button>
356
414
  </div>
357
415
  `;
358
416
  if (m.audio && m.dur) {
@@ -362,13 +420,13 @@
362
420
  actions.insertAdjacentHTML("beforebegin", scrubberHtml);
363
421
  wireScrubber(t, m);
364
422
  }
365
- // copy
423
+ // copy (inline icon → swaps to a check briefly)
366
424
  t.querySelector(".btn-copy")?.addEventListener("click", (e) => {
367
425
  navigator.clipboard?.writeText(m.text).catch(() => {});
368
426
  const btn = e.currentTarget;
369
427
  btn.classList.add("done");
370
- btn.innerHTML = `${ICON.check()} Copiado`;
371
- setTimeout(() => { btn.classList.remove("done"); btn.innerHTML = `${ICON.copy()} Copiar`; }, 1400);
428
+ btn.innerHTML = ICON.check();
429
+ setTimeout(() => { btn.classList.remove("done"); btn.innerHTML = ICON.copy(); }, 1400);
372
430
  });
373
431
  // regen: only the LAST agent turn can be regenerated. Past turns
374
432
  // can't because we'd have to re-issue the user prompt that came right
@@ -388,7 +446,10 @@
388
446
  if (history.length && history[history.length - 1].role === "assistant") {
389
447
  history.pop();
390
448
  }
391
- messages = messages.filter((x) => x.id !== m.id);
449
+ // A turn can be several agent bubbles (intro + post-tool answer…); drop
450
+ // them all so regen replaces the whole turn, not just the last segment.
451
+ const turnId = m.turn;
452
+ messages = messages.filter((x) => !(x.role === "agent" && turnId != null && x.turn === turnId) && x.id !== m.id);
392
453
  rebuildConvFromState();
393
454
  startAgentTurn();
394
455
  sendToDaemon(lastUser.text);
@@ -499,12 +560,14 @@
499
560
  }
500
561
 
501
562
  function addToolPill(name) {
502
- ensureStreamingAgentBubble();
503
- if (toolPillsByName[name]) return;
563
+ ensureConv();
564
+ if (!$convScroll || toolPillsByName[name]) return;
504
565
  const pill = document.createElement("div");
505
566
  pill.className = "tool-pill";
506
567
  pill.innerHTML = `<div class="spinner"></div><span>${escapeHtml(name)}</span>`;
507
- $convScroll.insertBefore(pill, streamingAgentEntry.el);
568
+ // Append at the end of the conversation flow — pills sit between the
569
+ // segment bubbles in the order tools actually run.
570
+ $convScroll.appendChild(pill);
508
571
  toolPillsByName[name] = pill;
509
572
  scrollConvToBottom();
510
573
  }
@@ -541,53 +604,46 @@
541
604
  const dur = m.dur || 1;
542
605
  const fmt = (s) => `0:${String(Math.round(s)).padStart(2, "0")}`;
543
606
  const audio = new Audio(m.audio);
607
+ m._audioEl = audio; // the audio queue drives sequential playback
544
608
  let raf = null;
545
- let progress = 0;
546
609
 
547
610
  const setProgress = (p) => {
548
- progress = Math.max(0, Math.min(1, p));
549
- const cur = Math.floor(progress * N);
611
+ p = Math.max(0, Math.min(1, p));
612
+ const cur = Math.floor(p * N);
550
613
  bars.forEach((b, i) => {
551
614
  b.classList.toggle("on", i <= cur);
552
615
  b.classList.toggle("cur", i === cur && !audio.paused);
553
616
  });
554
- $dur.textContent = progress > 0 || !audio.paused ? fmt(progress * dur) : fmt(dur);
617
+ $dur.textContent = p > 0 || !audio.paused ? fmt(p * dur) : fmt(dur);
555
618
  };
556
-
557
619
  const tick = () => {
558
620
  if (audio.duration > 0) setProgress(audio.currentTime / audio.duration);
559
621
  raf = requestAnimationFrame(tick);
560
622
  };
561
- audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); mode = "speaking"; render(); });
562
- audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); if (mode === "speaking") { mode = "idle"; render(); } });
563
- audio.addEventListener("ended", () => { setProgress(1); if (mode === "speaking") { mode = "idle"; render(); } });
623
+ audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); if (mode !== "speaking") { mode = "speaking"; render(); } });
624
+ audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); });
625
+ audio.addEventListener("ended", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); setProgress(1); onSegmentEnded(m); });
626
+ // 404 / decode error / autoplay block: don't hang — advance the queue.
627
+ audio.addEventListener("error", () => onSegmentEnded(m));
564
628
 
565
- $play.addEventListener("click", () => audio.paused ? audio.play() : audio.pause());
629
+ $play.addEventListener("click", () => {
630
+ if (audio.paused) {
631
+ // Manual play takes control — stop the auto-sequence so we don't fight it.
632
+ queuePlaying = false;
633
+ try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
634
+ ttsAudio = audio;
635
+ audio.play().catch(() => { if (mode === "speaking") { mode = "idle"; render(); } });
636
+ } else {
637
+ audio.pause();
638
+ if (mode === "speaking") { mode = "idle"; render(); }
639
+ }
640
+ });
566
641
  $bar.addEventListener("click", (e) => {
567
642
  const r = $bar.getBoundingClientRect();
568
643
  const p = Math.max(0, Math.min(1, (e.clientX - r.left) / r.width));
569
644
  if (audio.duration > 0) audio.currentTime = p * audio.duration;
570
645
  setProgress(p);
571
646
  });
572
-
573
- // If the audio errors out (404, decode error, autoplay block, etc) make
574
- // sure the capsule doesn't stay stuck in "está hablando…".
575
- audio.addEventListener("error", () => {
576
- if (mode === "speaking") { mode = "idle"; render(); }
577
- });
578
-
579
- // autoplay if it's the fresh reply
580
- if (m.fresh) {
581
- m.fresh = false;
582
- ttsAudio?.pause?.();
583
- ttsAudio = audio;
584
- audio.play().catch(() => {
585
- // Autoplay block (rare in Electron with user-gesture but possible
586
- // when the window has never been focused). Bail out so the capsule
587
- // returns to idle and the user can still tap "play" on the scrubber.
588
- if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
589
- });
590
- }
591
647
  }
592
648
 
593
649
  // Post-finalize hook: add a scrubber to an already-rendered agent turn
@@ -597,15 +653,18 @@
597
653
  if (!m) return;
598
654
  m.audio = url;
599
655
  m.dur = dur || 0;
600
- m.fresh = true; // autoplay the freshly-arrived reply
601
656
  const turnEl = $convScroll?.querySelector(`[data-id="${turnId}"]`);
602
- if (!turnEl) return;
603
- // Insert the scrubber HTML just before turn-actions (matches appendTurn order).
604
- const actions = turnEl.querySelector(".turn-actions");
605
- const html = buildScrubberHtml(m);
606
- if (actions) actions.insertAdjacentHTML("beforebegin", html);
607
- else turnEl.insertAdjacentHTML("beforeend", html);
608
- wireScrubber(turnEl, m);
657
+ if (turnEl && !turnEl.querySelector(".audio")) {
658
+ // Insert the scrubber HTML just before turn-actions (matches appendTurn).
659
+ const actions = turnEl.querySelector(".turn-actions");
660
+ const html = buildScrubberHtml(m);
661
+ if (actions) actions.insertAdjacentHTML("beforebegin", html);
662
+ else turnEl.insertAdjacentHTML("beforeend", html);
663
+ wireScrubber(turnEl, m); // sets m._audioEl
664
+ }
665
+ // Audio is ready → let the sequential queue play it when it's this
666
+ // segment's turn (gapless auto-play across the turn's bubbles).
667
+ queueMarkReady(m);
609
668
  scrollConvToBottom();
610
669
  }
611
670
 
@@ -620,10 +679,94 @@
620
679
  return out;
621
680
  }
622
681
 
682
+ // ── Per-turn setup + sequential audio queue ──────────────────────────────
683
+ // Each turn renders N agent bubbles (segments), each with its own audio. We
684
+ // play those audios in `seq` order, gaplessly: the cursor waits at a segment
685
+ // until its TTS lands, plays it, then advances. So Roby "speaks" its messages
686
+ // one after another even though they synthesize at different speeds.
687
// Start a new agent turn: bump the turn counter, reset the segment-audio
// queue, and clear the per-turn done / pending-TTS bookkeeping (including any
// armed TTS timer).
function beginAgentTurn() {
  currentTurn += 1;
  resetTurnAudio();
  pendingTtsTurnId = null;
  doneHandled = false;
  if (!ttsTimer) return;
  clearTimeout(ttsTimer);
  ttsTimer = null;
}
694
// Stop whatever audio is current and return the segment-audio queue to its
// empty state: no entries, cursor at 0, nothing playing, turn not done, and
// no watchdog timer armed.
function resetTurnAudio() {
  try {
    ttsAudio?.pause?.();
  } catch {}
  if (turnWatchdog) {
    clearTimeout(turnWatchdog);
    turnWatchdog = null;
  }
  ttsAudio = null;
  turnAudios = [];
  audioCursor = 0;
  queuePlaying = false;
  turnDone = false;
}
703
// Add message `m` to the turn's audio queue (once), keeping the queue ordered
// by the message's `seq`; a missing seq counts as 0.
function queueRegisterSegment(m) {
  const alreadyQueued = turnAudios.some((entry) => entry.m === m);
  if (alreadyQueued) return;
  turnAudios.push({ m, ready: false, failed: false, played: false });
  const seqOf = (entry) => entry.m.seq || 0;
  turnAudios.sort((left, right) => seqOf(left) - seqOf(right));
}
709
// Flag the queue entry for message `m` (if it is queued) as ready, then give
// the queue a chance to start playing.
function queueMarkReady(m) {
  for (const entry of turnAudios) {
    if (entry.m === m) {
      entry.ready = true;
      break;
    }
  }
  pumpAudioQueue();
}
714
// Flag the queue entry for message `m` (if it is queued) as failed. It is
// also marked ready and played so the cursor steps over it, then the queue is
// pumped.
function queueMarkFailed(m) {
  const entry = turnAudios.find((candidate) => candidate.m === m);
  if (entry) Object.assign(entry, { ready: true, failed: true, played: true });
  pumpAudioQueue();
}
719
// Advance the sequential segment-audio queue.
//
// Does nothing while a queued segment is already playing. Otherwise walks
// forward from `audioCursor`:
//   - stops (and waits) at the first entry that is not ready yet;
//   - steps over entries that are already played, failed, or have no audio
//     element;
//   - starts the first playable entry, pausing any other audio that is
//     still current, and returns.
// When the cursor runs off the end and the turn is done, the watchdog is
// cleared and a "speaking"/"thinking" mode drops back to "idle".
function pumpAudioQueue() {
  if (queuePlaying) return;
  while (audioCursor < turnAudios.length) {
    const e = turnAudios[audioCursor];
    if (!e.ready) return; // wait for this segment's TTS
    if (e.played || e.failed || !e.m._audioEl) { audioCursor++; continue; }
    const audio = e.m._audioEl;
    queuePlaying = true;
    try {
      if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause();
    } catch {
      // Pausing the previous audio is best-effort.
    }
    ttsAudio = audio;
    audio.play().catch(() => { // autoplay blocked / decode error / aborted
      // Only advance if this attempt still owns the queue. The rejection can
      // arrive after the segment was already accounted for (its `error`
      // event advanced the queue), after playback was stopped, or after
      // another audio took over; advancing again here would skip a segment
      // and cut off whatever is playing now.
      if (e.played || !queuePlaying || ttsAudio !== audio) return;
      queuePlaying = false;
      e.played = true;
      audioCursor++;
      pumpAudioQueue();
    });
    return;
  }
  // Drained. Once the turn is done and nothing's left, return to idle.
  if (turnDone) {
    if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
    if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
  }
}
743
// Handler for a segment audio finishing (`ended`) or failing (`error`).
// A queued segment is accounted for only once. If the finished audio is the
// one the queue is currently playing, the cursor moves on and the queue is
// pumped; otherwise a "speaking" mode simply drops back to "idle".
function onSegmentEnded(m) {
  const entry = turnAudios.find((candidate) => candidate.m === m);
  if (entry) {
    if (entry.played) return;
    entry.played = true;
  }
  const ownsQueue = queuePlaying && ttsAudio === m._audioEl;
  if (ownsQueue) {
    queuePlaying = false;
    audioCursor++;
    pumpAudioQueue();
    return;
  }
  if (mode === "speaking") {
    mode = "idle";
    render();
  }
}
755
+
623
756
  // ── Recording flow ───────────────────────────────────────────────────────
624
757
  function startListening() {
625
758
  if (mode !== "idle") return;
626
759
  isCancelled = false;
760
+ micReady = false; // show "Cargando…" until the recorder is actually live
761
+ speechSeen = false;
762
+ lastVoiceTs = 0;
763
+ pausePreviewed = false;
764
+ reuseLiveOnStop = false;
765
+ livePromise = null;
766
+ pendingUserText = "";
767
+ // Warm the whisper model now (overlaps the mic warm-up), so the decode at
768
+ // the end of this utterance doesn't pay a cold start.
769
+ window.apx?.warmupStt?.();
627
770
  mode = "listening";
628
771
  render();
629
772
  startMic();
@@ -649,6 +792,7 @@
649
792
  if (mode === "listening") { stopMic(); }
650
793
  if (mode === "thinking" || mode === "speaking") { window.apx?.cancel?.(); }
651
794
  removePendingUserPartial();
795
+ resetTurnAudio(); // stop any playing/queued segment audio
652
796
  if (streamingAgentEntry) {
653
797
  streamingAgentEntry.el.remove();
654
798
  streamingAgentEntry = null;
@@ -658,6 +802,8 @@
658
802
  }
659
803
 
660
804
  function stopSpeaking() {
805
+ // Halt the auto-sequence and the current segment.
806
+ queuePlaying = false;
661
807
  try { ttsAudio?.pause?.(); } catch {}
662
808
  if (mode === "speaking") { mode = "idle"; render(); }
663
809
  }
@@ -678,6 +824,7 @@
678
824
  analyser.maxDecibels = -15; // ceiling (loud speech)
679
825
  src.connect(analyser);
680
826
  freqData = new Uint8Array(analyser.frequencyBinCount);
827
+ timeData = new Uint8Array(analyser.fftSize);
681
828
  startWaveLoop();
682
829
  } catch (e) {
683
830
  console.warn("desktop renderer: AnalyserNode init failed", e);
@@ -691,14 +838,27 @@
691
838
  recordedChunks = [];
692
839
  mediaRecorder = new MediaRecorder(audioStream, { mimeType, audioBitsPerSecond: 32000 });
693
840
  mediaRecorder.ondataavailable = (e) => {
841
+ // Just buffer. We deliberately do NOT decode on every chunk anymore —
842
+ // re-decoding the growing clip every 2s serialized on the single
843
+ // whisper thread and the final decode queued behind it (the old ~10s
844
+ // stall). Transcription now happens once, on a pause / on stop.
694
845
  if (e.data && e.data.size > 0) recordedChunks.push(e.data);
695
- runLivePartial();
696
846
  };
697
847
  mediaRecorder.onstop = async () => {
698
848
  if (isCancelled) { recordedChunks = []; if (mode !== "idle") { mode = "idle"; render(); } return; }
699
- const raw = await transcribeBuffered();
700
- const text = (raw || "").trim();
849
+ let text = "";
850
+ // Auto-send after a pause: the pause already kicked a full decode that
851
+ // covers all the speech (the only thing after it is trailing silence),
852
+ // so reuse it instead of decoding the same audio again. Await the
853
+ // in-flight preview if it hasn't settled yet.
854
+ if (reuseLiveOnStop) {
855
+ if (livePromise) { try { await livePromise; } catch {} }
856
+ text = (pendingUserText || "").trim();
857
+ }
858
+ // Manual send (Enviar / ⌘G release) or no preview yet → one fresh decode.
859
+ if (!text) text = (await transcribeBuffered()).trim();
701
860
  recordedChunks = [];
861
+ reuseLiveOnStop = false;
702
862
  // Guard with .trim() — whisper occasionally returns a single space or
703
863
  // newline for very short clips, which used to commit an empty bubble.
704
864
  if (!text || isCancelled) {
@@ -711,7 +871,16 @@
711
871
  pendingUserText = text;
712
872
  commitUserMessage(text, /* via */ "voice");
713
873
  };
714
- mediaRecorder.start(2000);
874
+ // 1s timeslice: chunks land often enough that a pause-preview decode has
875
+ // audio to work with even for short utterances. We no longer decode per
876
+ // chunk (just buffer), so a smaller slice is essentially free.
877
+ mediaRecorder.start(1000);
878
+ // Recorder is now live → swap "Cargando…" for the reactive wave and let
879
+ // silence detection arm. lastVoiceTs starts now so a fully silent open
880
+ // won't auto-send (speechSeen gates that).
881
+ micReady = true;
882
+ lastVoiceTs = Date.now();
883
+ if (mode === "listening") render();
715
884
  } catch (e) {
716
885
  console.error("desktop renderer: mic error", e);
717
886
  mode = "idle";
@@ -723,11 +892,16 @@
723
892
  try { audioStream?.getTracks().forEach((t) => t.stop()); } catch {}
724
893
  mediaRecorder = null;
725
894
  audioStream = null;
895
+ micReady = false;
896
+ speechSeen = false;
897
+ lastVoiceTs = 0;
898
+ pausePreviewed = false;
726
899
  stopWaveLoop();
727
900
  try { audioCtx?.close(); } catch {}
728
901
  audioCtx = null;
729
902
  analyser = null;
730
903
  freqData = null;
904
+ timeData = null;
731
905
  }
732
906
 
733
907
  // ── Reactive wave: amplitude-driven bar heights (runs while mode === listening)
@@ -738,6 +912,43 @@
738
912
  const tick = () => {
739
913
  if (mode !== "listening" || !analyser) { waveRaf = null; return; }
740
914
  analyser.getByteFrequencyData(freqData);
915
+
916
+ // ── Silence auto-send ──────────────────────────────────────────────
917
+ // Time-domain RMS is a reliable voice/silence gate (unlike the freq
918
+ // bars, it's independent of the analyser's dB scaling). Once we've heard
919
+ // speech, SILENCE_MS of quiet commits the recording on its own.
920
+ if (micReady && timeData) {
921
+ analyser.getByteTimeDomainData(timeData);
922
+ let sumSq = 0;
923
+ for (let i = 0; i < timeData.length; i++) {
924
+ const v = (timeData[i] - 128) / 128;
925
+ sumSq += v * v;
926
+ }
927
+ const rms = Math.sqrt(sumSq / timeData.length);
928
+ const now = Date.now();
929
+ if (rms > VOICE_RMS) {
930
+ speechSeen = true;
931
+ lastVoiceTs = now;
932
+ pausePreviewed = false; // new speech → allow a fresh preview
933
+ } else if (speechSeen && lastVoiceTs) {
934
+ const silentFor = now - lastVoiceTs;
935
+ // A short pause kicks ONE decode of everything said so far. It doubles
936
+ // as the final transcription, so the auto-send below is instant
937
+ // instead of paying a decode after stop.
938
+ if (!pausePreviewed && silentFor >= PAUSE_PREVIEW_MS && !liveBusy) {
939
+ pausePreviewed = true;
940
+ runLivePartial();
941
+ }
942
+ // Sustained silence → auto-send, reusing the pause decode.
943
+ if (silentFor >= SILENCE_MS) {
944
+ waveRaf = null;
945
+ reuseLiveOnStop = true;
946
+ stopListening(/* commit */ true);
947
+ return;
948
+ }
949
+ }
950
+ }
951
+
741
952
  const wave = $capCenter.querySelector(".cap-wave");
742
953
  if (wave) {
743
954
  const bars = wave.children;
@@ -781,17 +992,20 @@
781
992
  } catch {}
782
993
  return "";
783
994
  }
784
/**
 * Decode what has been recorded so far (fired once per speech pause).
 *
 * The decoded text is stashed in `pendingUserText` so the auto-send path in
 * the recorder's `onstop` can reuse it instead of decoding the same audio
 * again. `livePromise` lets `onstop` await an in-flight decode before reading
 * the text.
 *
 * `pendingUserText` is cleared when a new decode is kicked. Without that, a
 * preview from an earlier pause would survive whenever the newest decode
 * produces nothing (empty result, or it settles after the mode has left
 * "listening") and the auto-send would commit that older, shorter text. With
 * the reset, `onstop` sees an empty preview and falls back to a fresh decode.
 *
 * @returns {Promise<void>|undefined} The in-flight decode, or undefined when
 *   no decode was started (one already running, not listening, or no audio).
 */
function runLivePartial() {
  if (liveBusy || mode !== "listening" || !recordedChunks.length) return;
  liveBusy = true;
  // Any earlier preview no longer covers everything said; never reuse it.
  pendingUserText = "";
  livePromise = (async () => {
    try {
      const text = await transcribeBuffered();
      // Only publish while still listening; a result that lands after the
      // mode changed is dropped.
      if (text && mode === "listening") pendingUserText = text;
    } finally {
      liveBusy = false;
    }
  })();
  return livePromise;
}
796
1010
 
797
1011
  // ── Send: text path + post-transcription commit path ─────────────────────
@@ -819,12 +1033,10 @@
819
1033
  // one ResizeObserver tick later). Shared by commitUserMessage + regen so
820
1034
  // both paths set up the daemon-event pipeline identically.
821
1035
  function startAgentTurn() {
822
- doneHandled = false;
823
- pendingTtsTurnId = null;
824
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
1036
+ beginAgentTurn(); // bump currentTurn + reset the audio queue/guards
825
1037
  mode = "thinking";
826
1038
  render();
827
- ensureStreamingAgentBubble();
1039
+ ensureConv(); // segments will mount their own bubbles
828
1040
  requestWindowResize();
829
1041
  }
830
1042
 
@@ -861,73 +1073,88 @@
861
1073
  window.apx?.onDaemonEvent?.((msg) => {
862
1074
  switch (msg.type) {
863
1075
  case "thinking":
864
- if (mode !== "thinking" && mode !== "speaking") { mode = "thinking"; render(); }
865
- ensureStreamingAgentBubble();
1076
+ // Marks the start of a turn. For locally-initiated turns startAgentTurn
1077
+ // already ran beginAgentTurn() (mode is already "thinking"); for turns
1078
+ // NOT initiated in this window (injected / broadcast from another client)
1079
+ // we set them up here so currentTurn/queue/doneHandled are correct and
1080
+ // the turn doesn't hang.
1081
+ if (mode !== "thinking" && mode !== "speaking") {
1082
+ beginAgentTurn();
1083
+ mode = "thinking";
1084
+ render();
1085
+ } else {
1086
+ doneHandled = false;
1087
+ }
1088
+ ensureConv();
866
1089
  break;
867
1090
  case "token":
1091
+ // Legacy path (backend no longer streams tokens for desktop). Kept so a
1092
+ // mixed-version daemon doesn't break — accumulate into a single bubble.
868
1093
  appendStreamingToken(msg.text || "");
869
1094
  break;
870
1095
  case "tool_start": addToolPill(msg.name); break;
871
1096
  case "tool_done": updateToolPill(msg.name); break;
1097
+ case "segment": {
1098
+ // Each segment is its own agent message bubble + its own audio.
1099
+ ensureConv();
1100
+ const text = (msg.text || "").trim();
1101
+ if (!text) break;
1102
+ const id = nextId++;
1103
+ const m = { id, seq: msg.seq || 0, turn: currentTurn, role: "agent", text, t: nowHHMM(), audio: null, dur: null };
1104
+ messages.push(m);
1105
+ appendTurn(m, true);
1106
+ queueRegisterSegment(m);
1107
+ // Synthesize THIS segment; tts-ready(seg=id) attaches its audio + queues
1108
+ // it for gapless sequential playback.
1109
+ window.apx?.requestTts?.(text, id);
1110
+ requestWindowResize();
1111
+ scrollConvToBottom();
1112
+ break;
1113
+ }
872
1114
  case "done": {
873
- // Daemon may emit `done` twice (retry/race). Process only once per turn.
874
1115
  if (doneHandled) break;
875
1116
  doneHandled = true;
876
- const finalText = msg.text || streamingAgentEntry?.text || "";
877
- // CRITICAL: many models (gemini-flash, groq-fast tier) don't stream
878
- // tokens they send the whole reply in `done`. Without this branch
879
- // the bubble stays with just the dots placeholder until TTS resolves
880
- // (or 6s timeout), which feels broken. Inject the text NOW so the
881
- // user sees the reply immediately.
882
- if (streamingAgentEntry) {
883
- streamingAgentEntry.text = finalText;
884
- if (!streamingAgentEntry.started && finalText) {
885
- streamingAgentEntry.started = true;
886
- streamingAgentEntry.msgEl.innerHTML = formatWordsHtml(finalText);
887
- scrollConvToBottom();
888
- }
889
- }
890
- // Finalize and return to idle right away so the capsule frees up.
891
- // TTS runs in the background; tts-ready will attach the scrubber to
892
- // the already-rendered turn (see attachAudioToLastAgentTurn below).
893
- const finalizedTurnId = streamingAgentEntry?.id;
894
- finalizeStreamingAgent();
895
- mode = "idle"; render();
896
- // Fire-and-forget TTS request. If it returns audio, attach it to
897
- // the turn we just rendered; if it errors / times out / never replies,
898
- // no big deal — the user already has the text. Guard with a 6s soft
899
- // timeout so a stuck request doesn't hold ttsTimer state.
900
- const handled = window.apx?.requestTts?.(finalText);
901
- if (handled) {
902
- if (ttsTimer) clearTimeout(ttsTimer);
903
- ttsTimer = setTimeout(() => { ttsTimer = null; }, 6000);
904
- // Remember which turn the next tts-ready/failed belongs to.
905
- pendingTtsTurnId = finalizedTurnId || null;
1117
+ turnDone = true;
1118
+ // Record the whole turn as one assistant entry for conversation context.
1119
+ const full = (msg.text || "").trim();
1120
+ if (full) history.push({ role: "assistant", content: full });
1121
+ // Safety net: if some segment's TTS never resolves, flush after 12s so
1122
+ // the capsule can't get stuck in "Pensando…".
1123
+ if (turnWatchdog) clearTimeout(turnWatchdog);
1124
+ turnWatchdog = setTimeout(() => {
1125
+ turnAudios.forEach((e) => { if (!e.ready) { e.ready = true; e.failed = true; e.played = true; } });
1126
+ pumpAudioQueue();
1127
+ }, 12000);
1128
+ // Play whatever audio is already ready; flip to idle if there's nothing
1129
+ // left to play (e.g. a turn that produced no audio).
1130
+ pumpAudioQueue();
1131
+ if (!queuePlaying && audioCursor >= turnAudios.length && mode !== "speaking") {
1132
+ mode = "idle"; render();
906
1133
  }
907
1134
  break;
908
1135
  }
909
- case "tts-ready": {
910
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
911
- if (pendingTtsTurnId != null) {
912
- attachAudioToTurn(pendingTtsTurnId, { url: msg.url, dur: msg.duration });
913
- pendingTtsTurnId = null;
914
- }
1136
+ case "tts-ready":
1137
+ if (msg.seg != null) attachAudioToTurn(msg.seg, { url: msg.url, dur: msg.duration });
915
1138
  break;
916
- }
917
- case "tts-failed":
918
- // The text is already on screen; just clean up the timer + pending id.
919
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
920
- pendingTtsTurnId = null;
1139
+ case "tts-failed": {
1140
+ // No audio for this segment — skip it in the queue so playback advances.
1141
+ const m = (msg.seg != null) ? messages.find((x) => x.id === msg.seg) : null;
1142
+ if (m) queueMarkFailed(m);
921
1143
  break;
922
- case "error":
923
- finalizeStreamingAgentError(msg.message || "Unknown error");
1144
+ }
1145
+ case "error": {
1146
+ ensureConv();
1147
+ const id = nextId++;
1148
+ const m = { id, seq: 9999, turn: currentTurn, role: "agent", text: "Error: " + (msg.message || "Unknown error"), t: nowHHMM(), isError: true };
1149
+ messages.push(m);
1150
+ appendTurn(m, true);
1151
+ turnDone = true;
1152
+ if (mode !== "speaking") { mode = "idle"; render(); }
924
1153
  break;
1154
+ }
925
1155
  case "cancelled":
926
- if (streamingAgentEntry) {
927
- if (!streamingAgentEntry.text) streamingAgentEntry.el.remove();
928
- else finalizeStreamingAgent();
929
- streamingAgentEntry = null;
930
- }
1156
+ resetTurnAudio();
1157
+ turnDone = true;
931
1158
  mode = "idle"; render();
932
1159
  break;
933
1160
  }
@@ -949,8 +1176,20 @@
949
1176
  document.addEventListener("keydown", (e) => {
950
1177
  if (e.key === "Escape") {
951
1178
  e.preventDefault();
952
- if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") cancel();
953
- else closeWindow();
1179
+ // Escape cancels whatever is in flight (recording / transcribing /
1180
+ // thinking / speaking). If nothing is in flight, a half-typed draft is
1181
+ // cleared first; only an empty idle capsule closes the window.
1182
+ if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") {
1183
+ cancel();
1184
+ return;
1185
+ }
1186
+ const input = $capCenter.querySelector("input");
1187
+ if (input && input.value.trim()) {
1188
+ input.value = "";
1189
+ render();
1190
+ } else {
1191
+ closeWindow();
1192
+ }
954
1193
  }
955
1194
  });
956
1195
 
@@ -975,6 +1214,13 @@
975
1214
  setInterval(requestWindowResize, 250);
976
1215
  }
977
1216
 
1217
// ── Keep STT warm ────────────────────────────────────────────────────────
// The whisper server idles out after ~10 min. For as long as the desktop
// window is running, ping the warm-up hook once right away and then every
// 4 minutes so the model stays loaded and the user's first utterance never
// pays the cold-load cost. The hook is optional, hence the optional calls.
const keepSttWarm = () => { window.apx?.warmupStt?.(); };
keepSttWarm();
setInterval(keepSttWarm, 4 * 60 * 1000);
1223
+
978
1224
  // ── Helpers ──────────────────────────────────────────────────────────────
979
1225
  function nowHHMM() {
980
1226
  const d = new Date();
@@ -328,19 +328,30 @@ button { font-family: inherit; }
328
328
  .wavebar i.cur { transform: scaleY(1.25); }
329
329
  .audio .dur { font-size: 11px; color: var(--ink-3); font-variant-numeric: tabular-nums; min-width: 30px; text-align: right; }
330
330
 
331
- /* per-turn actions */
331
+ /* Continued agent messages (no repeated "Roby" header) hug the previous one. */
332
+ .turn.cont { padding-top: 0; }
333
+ .turn.cont .msg-agent { margin-top: 1px; }
334
+
335
+ /* Inline copy icon at the end of an agent message — hover-only, so it never
336
+ reserves an empty row of vertical space. */
337
+ .msg-agent .btn-copy {
338
+ display: inline-flex; align-items: center; vertical-align: -3px;
339
+ margin-left: 6px; padding: 1px; border: none; background: transparent;
340
+ cursor: pointer; color: var(--ink-3); opacity: 0; transition: opacity .15s ease, color .15s ease;
341
+ }
342
+ .turn:hover .msg-agent .btn-copy { opacity: .55; }
343
+ .msg-agent .btn-copy:hover { opacity: 1; color: var(--ink); }
344
+ .msg-agent .btn-copy.done { opacity: 1; color: oklch(0.6 0.15 150); }
345
+
346
+ /* per-turn actions — only Regenerate now, and only on the LAST agent turn.
347
+ Regenerating a past turn would replay the most-recent user prompt (not the
348
+ one that produced that reply) and silently break the flow, so it's hidden
349
+ everywhere else and takes no space. */
332
350
  .turn-actions {
333
- margin: 7px 0 0 24px; display: flex; gap: 4px;
351
+ margin: 7px 0 0 24px; display: none; gap: 4px;
334
352
  opacity: 0; transition: opacity .2s ease;
335
353
  }
336
- .turn:hover .turn-actions, .turn.last .turn-actions { opacity: 1; }
337
-
338
- /* Regenerate is only meaningful on the LAST agent turn — regenerating a
339
- past one would replay the most-recent user prompt (not the one that
340
- produced this reply) and silently break the conversation flow. Copy
341
- stays available on every turn so users can grab old replies. */
342
- .turn .btn-regen { display: none; }
343
- .turn.last .btn-regen { display: inline-flex; }
354
+ .turn.last .turn-actions { display: flex; opacity: 1; }
344
355
  .chip {
345
356
  display: inline-flex; align-items: center; gap: 5px; cursor: pointer;
346
357
  padding: 4px 8px; border-radius: 9px; border: 1px solid var(--glass-hairline);