@agentprojectcontext/apx 1.27.2 → 1.28.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@agentprojectcontext/apx",
3
- "version": "1.27.2",
3
+ "version": "1.28.0",
4
4
  "description": "APX — unified CLI + daemon for the Agent Project Context (APC) standard.",
5
5
  "publishConfig": {
6
6
  "access": "public"
@@ -6,6 +6,18 @@
6
6
  //
7
7
  // Shared by overlay, telegram voice messages, and any external caller.
8
8
  export function register(app) {
9
+ // GET /transcribe/warmup — load the local whisper model (if needed) and reset
10
+ // its idle watchdog. Callers (e.g. the desktop window) ping this while open so
11
+ // the first real utterance doesn't pay the cold-load cost.
12
+ app.get("/transcribe/warmup", async (_req, res) => {
13
+ try {
14
+ const { warmupWhisper } = await import("../transcription.js");
15
+ res.json(await warmupWhisper());
16
+ } catch (e) {
17
+ res.status(500).json({ ok: false, error: e.message });
18
+ }
19
+ });
20
+
9
21
  app.post("/transcribe/chunk", async (req, res) => {
10
22
  const chunks = [];
11
23
  req.on("data", (c) => chunks.push(c));
@@ -100,9 +100,27 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
100
100
  await appendGlobalMessage({ channel: CHANNEL, direction: "in", type: "user", author: "user", body: text });
101
101
  } catch {}
102
102
 
103
- let fullResponse = "";
104
103
  let toolsExecuted = [];
105
104
 
105
+ // Per-segment streaming: instead of merging the whole turn into one blob, we
106
+ // emit each assistant text piece as its own `segment` (an intro before a tool,
107
+ // then the post-tool answer, …). The renderer renders each as its own bubble
108
+ // and synthesizes its own audio, so a multi-step reply reads as separate spoken
109
+ // messages instead of one run-on bubble. `liveBuf` accumulates streamed tokens
110
+ // (streaming engines) so they can be flushed as a segment at each boundary;
111
+ // for non-streaming models like gemini the text arrives whole via events.
112
+ let segSeq = 0;
113
+ let lastSegText = "";
114
+ let liveBuf = "";
115
+ const emittedSegments = [];
116
+ const emitSegment = (raw) => {
117
+ const seg = (raw || "").trim();
118
+ if (!seg || seg === lastSegText) return;
119
+ lastSegText = seg;
120
+ emittedSegments.push(seg);
121
+ _send(ws, { type: "segment", seq: ++segSeq, text: seg });
122
+ };
123
+
106
124
  try {
107
125
  if (!isSuperAgentEnabled(config)) {
108
126
  throw new Error("super-agent not enabled — set super_agent.enabled + super_agent.model in ~/.apx/config.json");
@@ -120,10 +138,7 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
120
138
  previousMessages: history.slice(0, -1),
121
139
  overrideModel: cfg.model || null,
122
140
  signal: controller.signal,
123
- onToken: (chunk) => {
124
- fullResponse += chunk;
125
- _send(ws, { type: "token", text: chunk });
126
- },
141
+ onToken: (chunk) => { liveBuf += chunk; },
127
142
  onEvent: async (event) => {
128
143
  if (event.type === "tool_start") {
129
144
  const t = event.trace;
@@ -131,17 +146,24 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
131
146
  _send(ws, { type: "tool_start", name: t.tool, args: t.args });
132
147
  } else if (event.type === "tool_result") {
133
148
  _send(ws, { type: "tool_done", name: event.trace.tool });
134
- } else if (event.type === "assistant_text" && event.text && !fullResponse) {
135
- _send(ws, { type: "token", text: event.text });
136
- fullResponse += event.text;
149
+ } else if (event.type === "assistant_text" && event.text) {
150
+ // A complete assistant text segment (e.g. the "I'll check…" intro
151
+ // emitted right before a tool runs). Ship it as its own message.
152
+ emitSegment(event.text);
153
+ liveBuf = "";
137
154
  }
138
155
  },
139
156
  });
140
- const finalText = fullResponse || result.text || "";
141
- log(`desktop: super-agent turn done in ${Date.now() - t0}ms text_len=${finalText.length}`);
157
+ // The final (no-tool) iteration's answer appears ONLY in result.text (or, for
158
+ // streaming engines, in liveBuf) — it's never emitted as an event. Ship it as
159
+ // the closing segment (deduped against the last one).
160
+ emitSegment((result.text || "").trim() || liveBuf.trim());
161
+
162
+ const finalText = emittedSegments.join("\n\n");
163
+ log(`desktop: super-agent turn done in ${Date.now() - t0}ms segments=${segSeq} text_len=${finalText.length} tools=${toolsExecuted.length}`);
142
164
 
143
- // Emit done with full text
144
- _send(ws, { type: "done", text: finalText });
165
+ // Turn end. `segments` lets the renderer know how many bubbles to expect.
166
+ _send(ws, { type: "done", segments: segSeq, text: finalText });
145
167
 
146
168
  // Append assistant turn to history
147
169
  if (ws && histories) {
@@ -481,6 +481,35 @@ export async function preloadWhisperServer(log = console.log) {
481
481
  }
482
482
  }
483
483
 
484
+ /**
485
+ * Keep the local whisper server warm. Ensures it's loaded and pings /warmup,
486
+ * which resets the server's idle watchdog so a live session (e.g. the desktop
487
+ * window held open) never pays the cold-load cost on the next utterance.
488
+ * Cheap and safe to call repeatedly. Never throws.
489
+ * Returns { ok, provider?, model?, loaded?, error? } for the caller to surface.
490
+ */
491
+ export async function warmupWhisper() {
492
+ try {
493
+ const cfg = await getConfig();
494
+ if (cfg.provider === "openai") return { ok: true, provider: "openai", loaded: false };
495
+ await ensureWhisperServer(cfg.local);
496
+ // /warmup loads the model into RAM (lazy otherwise) AND touches _last_used,
497
+ // resetting the idle timer. First call may block ~15-30s on a cold model;
498
+ // instant once warm. Generous timeout so the cold load can finish.
499
+ let loaded = false;
500
+ try {
501
+ const r = await fetch(`http://127.0.0.1:${WHISPER_PORT}/warmup`, {
502
+ signal: AbortSignal.timeout(40_000),
503
+ });
504
+ const j = await r.json().catch(() => ({}));
505
+ loaded = !!j.loaded;
506
+ } catch {}
507
+ return { ok: true, provider: "local", model: _serverModel, loaded };
508
+ } catch (e) {
509
+ return { ok: false, error: e.message };
510
+ }
511
+ }
512
+
484
513
  /**
485
514
  * Stop the whisper server we own (no-op if we adopted an external one).
486
515
  */
@@ -94,6 +94,17 @@ class _Handler(BaseHTTPRequestHandler):
94
94
  "model": _model_name or _Handler.model_name,
95
95
  "loaded": _model is not None,
96
96
  })
97
+ elif self.path == "/warmup":
98
+ # Eagerly load the model into RAM (no audio needed) and reset the
99
+ # idle timer, so the first real transcription isn't cold. Blocks
100
+ # until the model is loaded the first time; instant once warm.
101
+ _touch()
102
+ with _model_lock:
103
+ try:
104
+ _load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
105
+ self._send_json(200, {"ok": True, "loaded": _model is not None, "model": _model_name})
106
+ except Exception as e:
107
+ self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
97
108
  else:
98
109
  self._send_json(404, {"ok": False, "error": "not found"})
99
110
 
@@ -46,6 +46,20 @@ function getShortcut() {
46
46
  return cfg?.desktop?.shortcut || cfg?.overlay?.shortcut || DEFAULT_SHORTCUT;
47
47
  }
48
48
 
49
+ // Voice-capture timing for the listening capsule. Overridable in config.json:
50
+ // "desktop": { "silence_ms": 1200, "voice_rms": 0.025 }
51
+ // silence_ms — quiet after speech before auto-send. voice_rms — RMS above
52
+ // which audio counts as voice (lower = more sensitive).
53
+ function getVoiceTiming() {
54
+ const cfg = readApxConfig();
55
+ const d = cfg?.desktop || cfg?.overlay || {};
56
+ const num = (v, def) => (typeof v === "number" && isFinite(v) ? v : def);
57
+ return {
58
+ silence_ms: Math.max(400, num(d.silence_ms, 1200)),
59
+ voice_rms: Math.max(0, num(d.voice_rms, 0.025)),
60
+ };
61
+ }
62
+
49
63
  function readToken() {
50
64
  try { return fs.readFileSync(TOKEN_PATH, "utf8").trim(); } catch { return ""; }
51
65
  }
@@ -397,6 +411,7 @@ ipcMain.handle("get-shortcut", () => getShortcut());
397
411
  ipcMain.handle("get-theme", () => getTheme());
398
412
  ipcMain.handle("get-position", () => getPosition());
399
413
  ipcMain.handle("get-agent-name", () => getAgentName());
414
+ ipcMain.handle("get-voice-timing", () => getVoiceTiming());
400
415
 
401
416
  // Renderer asks main to grow/shrink the window to fit its content.
402
417
  // Clamped to [WIN_H_MIN, getMaxWindowHeight()]; same anchor (top edge stays put).
@@ -411,27 +426,29 @@ ipcMain.on("resize-window", (_e, { height }) => {
411
426
  // Renderer asks for TTS playback of the agent reply. We synthesize via the
412
427
  // daemon and pipe the audio path back as a daemon-event the renderer already
413
428
  // knows how to consume (tts-ready { url, duration } / tts-failed).
414
- ipcMain.handle("request-tts", async (_e, { text }) => {
429
+ ipcMain.handle("request-tts", async (_e, { text, seg }) => {
415
430
  if (!text || !text.trim()) {
416
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed" });
431
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg });
417
432
  return;
418
433
  }
419
434
  try {
420
435
  const result = await daemonTtsSay(text);
421
436
  if (result?.ok && result.audio_path) {
422
437
  // Expose the local file via file:// — preload's contextIsolation lets
423
- // the renderer's <audio> tag fetch it directly.
438
+ // the renderer's <audio> tag fetch it directly. `seg` ties this audio to
439
+ // the bubble that asked for it.
424
440
  const url = "file://" + result.audio_path;
425
441
  mainWindow?.webContents.send("daemon-event", {
426
442
  type: "tts-ready",
443
+ seg,
427
444
  url,
428
445
  duration: result.duration_s || 0,
429
446
  });
430
447
  } else {
431
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: result?.error || "no audio" });
448
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: result?.error || "no audio" });
432
449
  }
433
450
  } catch (e) {
434
- mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: e.message });
451
+ mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: e.message });
435
452
  }
436
453
  });
437
454
 
@@ -462,6 +479,32 @@ ipcMain.handle("check-whisper-ready", () => {
462
479
  });
463
480
  });
464
481
 
482
+ // Renderer asks to keep STT warm. Routed through the daemon (not whisper
483
+ // directly) so it both LOADS the model if it idled out and resets the idle
484
+ // watchdog. Fire-and-forget from the renderer's side.
485
+ ipcMain.handle("warmup-stt", async () => {
486
+ return new Promise((resolve) => {
487
+ const token = readToken();
488
+ const options = {
489
+ hostname: DAEMON_HOST,
490
+ port: DAEMON_PORT,
491
+ path: "/transcribe/warmup",
492
+ method: "GET",
493
+ headers: { ...(token ? { "Authorization": `Bearer ${token}` } : {}) },
494
+ };
495
+ const req = http.request(options, (res) => {
496
+ let data = "";
497
+ res.on("data", (c) => data += c);
498
+ res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve({ ok: false }); } });
499
+ });
500
+ req.on("error", () => resolve({ ok: false }));
501
+ // Cold model load can take ~30s; give it room. (Renderer fires this
502
+ // fire-and-forget, so a long warm-up never blocks the UI.)
503
+ req.setTimeout(45000, () => { req.destroy(); resolve({ ok: false }); });
504
+ req.end();
505
+ });
506
+ });
507
+
465
508
  // Renderer requests recording toggle (ESC cancels, shortcut toggles)
466
509
  ipcMain.handle("toggle-recording", async () => {
467
510
  if (isRecording) stopRecording(); else startRecording();
@@ -18,14 +18,19 @@ contextBridge.exposeInMainWorld("apx", {
18
18
  // Check if the whisper model is loaded (false = still loading)
19
19
  checkWhisperReady: () => ipcRenderer.invoke("check-whisper-ready"),
20
20
 
21
+ // Keep STT warm (loads the model if idle + resets the idle timer). Called
22
+ // while the window is open / on mic-open so the first decode isn't cold.
23
+ warmupStt: () => ipcRenderer.invoke("warmup-stt").catch(() => ({ ok: false })),
24
+
21
25
  // Send final text to daemon
22
26
  sendMessage: (text, previousMessages) =>
23
27
  ipcRenderer.invoke("send-message", { text, previousMessages }),
24
28
 
25
- // After "done", ask main to synthesize TTS. Returns true if main will reply
26
- // with a tts-ready / tts-failed daemon-event; false if TTS is not wired.
27
- requestTts: (text) => {
28
- ipcRenderer.invoke("request-tts", { text }).catch(() => {});
29
+ // Ask main to synthesize TTS for one segment. `seg` correlates the resulting
30
+ // tts-ready/tts-failed event back to the bubble that requested it (each
31
+ // assistant message has its own audio). Returns true optimistically.
32
+ requestTts: (text, seg) => {
33
+ ipcRenderer.invoke("request-tts", { text, seg }).catch(() => {});
29
34
  return true; // optimistic; renderer waits for the event either way
30
35
  },
31
36
 
@@ -42,6 +47,7 @@ contextBridge.exposeInMainWorld("apx", {
42
47
  getTheme: () => ipcRenderer.invoke("get-theme"),
43
48
  getPosition: () => ipcRenderer.invoke("get-position"),
44
49
  getAgentName: () => ipcRenderer.invoke("get-agent-name"),
50
+ getVoiceTiming: () => ipcRenderer.invoke("get-voice-timing"),
45
51
 
46
52
  // Renderer asks main to resize the BrowserWindow to the rendered height
47
53
  resize: (height) => ipcRenderer.send("resize-window", { height }),
@@ -31,15 +31,49 @@
31
31
  let recorderFormat = "webm";
32
32
  let liveBusy = false;
33
33
 
34
+ // Mic is async to open (getUserMedia + recorder warm-up). Until it's actually
35
+ // capturing we show a "Cargando…" state instead of the wave, so the user
36
+ // doesn't talk into the dead gap before the recorder starts.
37
+ let micReady = false;
38
+
39
+ // Silence auto-send: once speech has been heard, SILENCE_MS of quiet
40
+ // auto-commits the recording. RMS (time-domain) is the voice/silence gate.
41
+ // Both are overridable from config.json (desktop.silence_ms / voice_rms).
42
+ let speechSeen = false;
43
+ let lastVoiceTs = 0;
44
+ let SILENCE_MS = 1200; // quiet after speech → send on its own
45
+ let VOICE_RMS = 0.025; // RMS above this counts as voice (0 = silence)
46
+ const PAUSE_PREVIEW_MS = 600; // a short pause kicks ONE decode (reused on send)
47
+
48
+ // When a pause triggers a preview decode, that decode already covers all the
49
+ // speech (the tail is just trailing silence), so the auto-send reuses it
50
+ // instead of paying a second full decode. These coordinate that handoff.
51
+ let pausePreviewed = false; // a preview decode fired for the current pause
52
+ let reuseLiveOnStop = false; // commit should reuse pendingUserText, not re-decode
53
+ let livePromise = null; // in-flight preview decode (awaited on reuse)
54
+
34
55
  // Web Audio analyser — drives the live capsule wave from real mic amplitude
35
56
  let audioCtx = null;
36
57
  let analyser = null;
37
58
  let freqData = null;
59
+ let timeData = null;
38
60
  let waveRaf = null;
39
61
 
40
- let streamingAgentEntry = null; // { id, role:'agent', el, ... } during thinking/speaking
41
- let toolPillsByName = {}; // active tool pills inside the streaming bubble row
42
- let ttsAudio = null; // <audio> playing the agent reply
62
+ let streamingAgentEntry = null; // legacy single-bubble streaming (kept dormant)
63
+ let toolPillsByName = {}; // active tool pills, by tool name, for the live turn
64
+ let ttsAudio = null; // <audio> currently playing
65
+
66
+ // ── Per-segment turn rendering ──────────────────────────────────────────
67
+ // A turn is now N agent message bubbles (intro, post-tool answer, …), each
68
+ // with its own audio. `currentTurn` tags every bubble of a turn so regen can
69
+ // drop the whole turn. The audio queue plays segment audios in seq order
70
+ // (gapless auto-play), waiting at the cursor for each segment's TTS to land.
71
+ let currentTurn = 0;
72
+ let turnAudios = []; // [{ m, ready, failed, played }] ordered by seq
73
+ let audioCursor = 0; // index of the next segment to play
74
+ let queuePlaying = false; // a segment audio is currently playing
75
+ let turnDone = false; // `done` received for the active turn
76
+ let turnWatchdog = null; // flushes the queue if a segment's TTS hangs
43
77
 
44
78
  let history = []; // [{role:'user'|'assistant', content}] sent to daemon for context
45
79
  let theme = "light";
@@ -119,10 +153,15 @@
119
153
  window.apx?.getPosition?.() ?? "right",
120
154
  window.apx?.getShortcut?.() ?? "CommandOrControl+G",
121
155
  window.apx?.getAgentName?.() ?? "Superagente",
122
- ]).then(([th, pos, shortcut, name]) => {
156
+ window.apx?.getVoiceTiming?.() ?? null,
157
+ ]).then(([th, pos, shortcut, name, timing]) => {
123
158
  theme = th || "light";
124
159
  position = pos || "right";
125
160
  agentName = (name && String(name).trim()) || "Superagente";
161
+ if (timing) {
162
+ if (typeof timing.silence_ms === "number") SILENCE_MS = timing.silence_ms;
163
+ if (typeof timing.voice_rms === "number") VOICE_RMS = timing.voice_rms;
164
+ }
126
165
  document.documentElement.setAttribute("data-theme", theme);
127
166
  setPosition(position);
128
167
  initialCaption(shortcut);
@@ -217,6 +256,13 @@
217
256
  }
218
257
  }
219
258
  // else: input already there → leave it alone (preserves focus + caret)
259
+ } else if (mode === "listening" && !micReady) {
260
+ // Mic still opening (getUserMedia + recorder warm-up). Show a loading
261
+ // status so the user waits for capture instead of talking into the gap.
262
+ if ($capCenter.dataset.mode !== "loading") {
263
+ $capCenter.dataset.mode = "loading";
264
+ $capCenter.innerHTML = `<span class="status"><span class="dots"><i></i><i></i><i></i></span><span class="shimmer">Cargando…</span></span>`;
265
+ }
220
266
  } else if (mode === "listening") {
221
267
  // Only rebuild the wave if it's not already there (avoids restarting
222
268
  // CSS animations / Web Audio binding every render).
@@ -246,9 +292,10 @@
246
292
  }
247
293
  }
248
294
  }
249
- // Clear data-mode when we're back to idle/listening so a future busy mode
250
- // re-renders correctly.
251
- if (mode === "idle" || mode === "listening") $capCenter.dataset.mode = "";
295
+ // Clear data-mode when we're back to idle, or once the live wave is up, so
296
+ // a future busy mode re-renders correctly. While the mic is still warming
297
+ // up we keep the "loading" marker so "Cargando…" isn't rebuilt every frame.
298
+ if (mode === "idle" || (mode === "listening" && micReady)) $capCenter.dataset.mode = "";
252
299
 
253
300
  // actions
254
301
  $capActions.innerHTML = "";
@@ -273,7 +320,8 @@
273
320
  }
274
321
  } else if (mode === "listening") {
275
322
  addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
276
- addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
323
+ // No "Enviar" until the recorder is live — nothing to send mid-warm-up.
324
+ if (micReady) addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
277
325
  } else if (mode === "transcribing") {
278
326
  addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
279
327
  } else if (mode === "thinking") {
@@ -321,7 +369,6 @@
321
369
  // Re-render all existing turns
322
370
  messages.forEach((m, i) => appendTurn(m, i === messages.length - 1));
323
371
  if (mode === "transcribing") renderPendingUserPartial();
324
- if (mode === "thinking" || mode === "speaking") ensureStreamingAgentBubble();
325
372
  }
326
373
  }
327
374
 
@@ -388,7 +435,10 @@
388
435
  if (history.length && history[history.length - 1].role === "assistant") {
389
436
  history.pop();
390
437
  }
391
- messages = messages.filter((x) => x.id !== m.id);
438
+ // A turn can be several agent bubbles (intro + post-tool answer…); drop
439
+ // them all so regen replaces the whole turn, not just the last segment.
440
+ const turnId = m.turn;
441
+ messages = messages.filter((x) => !(x.role === "agent" && turnId != null && x.turn === turnId) && x.id !== m.id);
392
442
  rebuildConvFromState();
393
443
  startAgentTurn();
394
444
  sendToDaemon(lastUser.text);
@@ -499,12 +549,14 @@
499
549
  }
500
550
 
501
551
  function addToolPill(name) {
502
- ensureStreamingAgentBubble();
503
- if (toolPillsByName[name]) return;
552
+ ensureConv();
553
+ if (!$convScroll || toolPillsByName[name]) return;
504
554
  const pill = document.createElement("div");
505
555
  pill.className = "tool-pill";
506
556
  pill.innerHTML = `<div class="spinner"></div><span>${escapeHtml(name)}</span>`;
507
- $convScroll.insertBefore(pill, streamingAgentEntry.el);
557
+ // Append at the end of the conversation flow — pills sit between the
558
+ // segment bubbles in the order tools actually run.
559
+ $convScroll.appendChild(pill);
508
560
  toolPillsByName[name] = pill;
509
561
  scrollConvToBottom();
510
562
  }
@@ -541,53 +593,46 @@
541
593
  const dur = m.dur || 1;
542
594
  const fmt = (s) => `0:${String(Math.round(s)).padStart(2, "0")}`;
543
595
  const audio = new Audio(m.audio);
596
+ m._audioEl = audio; // the audio queue drives sequential playback
544
597
  let raf = null;
545
- let progress = 0;
546
598
 
547
599
  const setProgress = (p) => {
548
- progress = Math.max(0, Math.min(1, p));
549
- const cur = Math.floor(progress * N);
600
+ p = Math.max(0, Math.min(1, p));
601
+ const cur = Math.floor(p * N);
550
602
  bars.forEach((b, i) => {
551
603
  b.classList.toggle("on", i <= cur);
552
604
  b.classList.toggle("cur", i === cur && !audio.paused);
553
605
  });
554
- $dur.textContent = progress > 0 || !audio.paused ? fmt(progress * dur) : fmt(dur);
606
+ $dur.textContent = p > 0 || !audio.paused ? fmt(p * dur) : fmt(dur);
555
607
  };
556
-
557
608
  const tick = () => {
558
609
  if (audio.duration > 0) setProgress(audio.currentTime / audio.duration);
559
610
  raf = requestAnimationFrame(tick);
560
611
  };
561
- audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); mode = "speaking"; render(); });
562
- audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); if (mode === "speaking") { mode = "idle"; render(); } });
563
- audio.addEventListener("ended", () => { setProgress(1); if (mode === "speaking") { mode = "idle"; render(); } });
612
+ audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); if (mode !== "speaking") { mode = "speaking"; render(); } });
613
+ audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); });
614
+ audio.addEventListener("ended", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); setProgress(1); onSegmentEnded(m); });
615
+ // 404 / decode error / autoplay block: don't hang — advance the queue.
616
+ audio.addEventListener("error", () => onSegmentEnded(m));
564
617
 
565
- $play.addEventListener("click", () => audio.paused ? audio.play() : audio.pause());
618
+ $play.addEventListener("click", () => {
619
+ if (audio.paused) {
620
+ // Manual play takes control — stop the auto-sequence so we don't fight it.
621
+ queuePlaying = false;
622
+ try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
623
+ ttsAudio = audio;
624
+ audio.play().catch(() => { if (mode === "speaking") { mode = "idle"; render(); } });
625
+ } else {
626
+ audio.pause();
627
+ if (mode === "speaking") { mode = "idle"; render(); }
628
+ }
629
+ });
566
630
  $bar.addEventListener("click", (e) => {
567
631
  const r = $bar.getBoundingClientRect();
568
632
  const p = Math.max(0, Math.min(1, (e.clientX - r.left) / r.width));
569
633
  if (audio.duration > 0) audio.currentTime = p * audio.duration;
570
634
  setProgress(p);
571
635
  });
572
-
573
- // If the audio errors out (404, decode error, autoplay block, etc) make
574
- // sure the capsule doesn't stay stuck in "está hablando…".
575
- audio.addEventListener("error", () => {
576
- if (mode === "speaking") { mode = "idle"; render(); }
577
- });
578
-
579
- // autoplay if it's the fresh reply
580
- if (m.fresh) {
581
- m.fresh = false;
582
- ttsAudio?.pause?.();
583
- ttsAudio = audio;
584
- audio.play().catch(() => {
585
- // Autoplay block (rare in Electron with user-gesture but possible
586
- // when the window has never been focused). Bail out so the capsule
587
- // returns to idle and the user can still tap "play" on the scrubber.
588
- if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
589
- });
590
- }
591
636
  }
592
637
 
593
638
  // Post-finalize hook: add a scrubber to an already-rendered agent turn
@@ -597,15 +642,18 @@
597
642
  if (!m) return;
598
643
  m.audio = url;
599
644
  m.dur = dur || 0;
600
- m.fresh = true; // autoplay the freshly-arrived reply
601
645
  const turnEl = $convScroll?.querySelector(`[data-id="${turnId}"]`);
602
- if (!turnEl) return;
603
- // Insert the scrubber HTML just before turn-actions (matches appendTurn order).
604
- const actions = turnEl.querySelector(".turn-actions");
605
- const html = buildScrubberHtml(m);
606
- if (actions) actions.insertAdjacentHTML("beforebegin", html);
607
- else turnEl.insertAdjacentHTML("beforeend", html);
608
- wireScrubber(turnEl, m);
646
+ if (turnEl && !turnEl.querySelector(".audio")) {
647
+ // Insert the scrubber HTML just before turn-actions (matches appendTurn).
648
+ const actions = turnEl.querySelector(".turn-actions");
649
+ const html = buildScrubberHtml(m);
650
+ if (actions) actions.insertAdjacentHTML("beforebegin", html);
651
+ else turnEl.insertAdjacentHTML("beforeend", html);
652
+ wireScrubber(turnEl, m); // sets m._audioEl
653
+ }
654
+ // Audio is ready → let the sequential queue play it when it's this
655
+ // segment's turn (gapless auto-play across the turn's bubbles).
656
+ queueMarkReady(m);
609
657
  scrollConvToBottom();
610
658
  }
611
659
 
@@ -620,10 +668,94 @@
620
668
  return out;
621
669
  }
622
670
 
671
+ // ── Per-turn setup + sequential audio queue ──────────────────────────────
672
+ // Each turn renders N agent bubbles (segments), each with its own audio. We
673
+ // play those audios in `seq` order, gaplessly: the cursor waits at a segment
674
+ // until its TTS lands, plays it, then advances. So Roby "speaks" its messages
675
+ // one after another even though they synthesize at different speeds.
676
+ function beginAgentTurn() {
677
+ currentTurn++;
678
+ resetTurnAudio();
679
+ doneHandled = false;
680
+ pendingTtsTurnId = null;
681
+ if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
682
+ }
683
+ function resetTurnAudio() {
684
+ try { ttsAudio?.pause?.(); } catch {}
685
+ ttsAudio = null;
686
+ turnAudios = [];
687
+ audioCursor = 0;
688
+ queuePlaying = false;
689
+ turnDone = false;
690
+ if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
691
+ }
692
+ function queueRegisterSegment(m) {
693
+ if (!turnAudios.some((e) => e.m === m)) {
694
+ turnAudios.push({ m, ready: false, failed: false, played: false });
695
+ turnAudios.sort((a, b) => (a.m.seq || 0) - (b.m.seq || 0));
696
+ }
697
+ }
698
+ function queueMarkReady(m) {
699
+ const e = turnAudios.find((x) => x.m === m);
700
+ if (e) e.ready = true;
701
+ pumpAudioQueue();
702
+ }
703
+ function queueMarkFailed(m) {
704
+ const e = turnAudios.find((x) => x.m === m);
705
+ if (e) { e.ready = true; e.failed = true; e.played = true; }
706
+ pumpAudioQueue();
707
+ }
708
+ function pumpAudioQueue() {
709
+ if (queuePlaying) return;
710
+ while (audioCursor < turnAudios.length) {
711
+ const e = turnAudios[audioCursor];
712
+ if (!e.ready) return; // wait for this segment's TTS
713
+ if (e.played || e.failed || !e.m._audioEl) { audioCursor++; continue; }
714
+ const audio = e.m._audioEl;
715
+ queuePlaying = true;
716
+ try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
717
+ ttsAudio = audio;
718
+ audio.play().catch(() => { // autoplay blocked / decode error
719
+ queuePlaying = false;
720
+ e.played = true;
721
+ audioCursor++;
722
+ pumpAudioQueue();
723
+ });
724
+ return;
725
+ }
726
+ // Drained. Once the turn is done and nothing's left, return to idle.
727
+ if (turnDone) {
728
+ if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
729
+ if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
730
+ }
731
+ }
732
+ // Called from a segment audio's `ended` (or `error`). Advances the queue.
733
+ function onSegmentEnded(m) {
734
+ const e = turnAudios.find((x) => x.m === m);
735
+ if (e) { if (e.played) return; e.played = true; }
736
+ if (queuePlaying && ttsAudio === m._audioEl) {
737
+ queuePlaying = false;
738
+ audioCursor++;
739
+ pumpAudioQueue();
740
+ } else if (mode === "speaking") {
741
+ mode = "idle"; render();
742
+ }
743
+ }
744
+
623
745
  // ── Recording flow ───────────────────────────────────────────────────────
624
746
  function startListening() {
625
747
  if (mode !== "idle") return;
626
748
  isCancelled = false;
749
+ micReady = false; // show "Cargando…" until the recorder is actually live
750
+ speechSeen = false;
751
+ lastVoiceTs = 0;
752
+ pausePreviewed = false;
753
+ reuseLiveOnStop = false;
754
+ livePromise = null;
755
+ pendingUserText = "";
756
+ // Warm the whisper model now (overlaps the mic warm-up), so the decode at
757
+ // the end of this utterance doesn't pay a cold start.
758
+ window.apx?.warmupStt?.();
627
759
  mode = "listening";
628
760
  render();
629
761
  startMic();
@@ -649,6 +781,7 @@
649
781
  if (mode === "listening") { stopMic(); }
650
782
  if (mode === "thinking" || mode === "speaking") { window.apx?.cancel?.(); }
651
783
  removePendingUserPartial();
784
+ resetTurnAudio(); // stop any playing/queued segment audio
652
785
  if (streamingAgentEntry) {
653
786
  streamingAgentEntry.el.remove();
654
787
  streamingAgentEntry = null;
@@ -658,6 +791,8 @@
658
791
  }
659
792
 
660
793
  function stopSpeaking() {
794
+ // Halt the auto-sequence and the current segment.
795
+ queuePlaying = false;
661
796
  try { ttsAudio?.pause?.(); } catch {}
662
797
  if (mode === "speaking") { mode = "idle"; render(); }
663
798
  }
@@ -678,6 +813,7 @@
678
813
  analyser.maxDecibels = -15; // ceiling (loud speech)
679
814
  src.connect(analyser);
680
815
  freqData = new Uint8Array(analyser.frequencyBinCount);
816
+ timeData = new Uint8Array(analyser.fftSize);
681
817
  startWaveLoop();
682
818
  } catch (e) {
683
819
  console.warn("desktop renderer: AnalyserNode init failed", e);
@@ -691,14 +827,27 @@
691
827
  recordedChunks = [];
692
828
  mediaRecorder = new MediaRecorder(audioStream, { mimeType, audioBitsPerSecond: 32000 });
693
829
  mediaRecorder.ondataavailable = (e) => {
830
+ // Just buffer. We deliberately do NOT decode on every chunk anymore —
831
+ // re-decoding the growing clip every 2s serialized on the single
832
+ // whisper thread and the final decode queued behind it (the old ~10s
833
+ // stall). Transcription now happens once, on a pause / on stop.
694
834
  if (e.data && e.data.size > 0) recordedChunks.push(e.data);
695
- runLivePartial();
696
835
  };
697
836
  mediaRecorder.onstop = async () => {
698
837
  if (isCancelled) { recordedChunks = []; if (mode !== "idle") { mode = "idle"; render(); } return; }
699
- const raw = await transcribeBuffered();
700
- const text = (raw || "").trim();
838
+ let text = "";
839
+ // Auto-send after a pause: the pause already kicked a full decode that
840
+ // covers all the speech (the only thing after it is trailing silence),
841
+ // so reuse it instead of decoding the same audio again. Await the
842
+ // in-flight preview if it hasn't settled yet.
843
+ if (reuseLiveOnStop) {
844
+ if (livePromise) { try { await livePromise; } catch {} }
845
+ text = (pendingUserText || "").trim();
846
+ }
847
+ // Manual send (Enviar / ⌘G release) or no preview yet → one fresh decode.
848
+ if (!text) text = (await transcribeBuffered()).trim();
701
849
  recordedChunks = [];
850
+ reuseLiveOnStop = false;
702
851
  // Guard with .trim() — whisper occasionally returns a single space or
703
852
  // newline for very short clips, which used to commit an empty bubble.
704
853
  if (!text || isCancelled) {
@@ -711,7 +860,16 @@
711
860
  pendingUserText = text;
712
861
  commitUserMessage(text, /* via */ "voice");
713
862
  };
714
- mediaRecorder.start(2000);
863
+ // 1s timeslice: chunks land often enough that a pause-preview decode has
864
+ // audio to work with even for short utterances. We no longer decode per
865
+ // chunk (just buffer), so a smaller slice is essentially free.
866
+ mediaRecorder.start(1000);
867
+ // Recorder is now live → swap "Cargando…" for the reactive wave and let
868
+ // silence detection arm. lastVoiceTs starts now so a fully silent open
869
+ // won't auto-send (speechSeen gates that).
870
+ micReady = true;
871
+ lastVoiceTs = Date.now();
872
+ if (mode === "listening") render();
715
873
  } catch (e) {
716
874
  console.error("desktop renderer: mic error", e);
717
875
  mode = "idle";
@@ -723,11 +881,16 @@
723
881
  try { audioStream?.getTracks().forEach((t) => t.stop()); } catch {}
724
882
  mediaRecorder = null;
725
883
  audioStream = null;
884
+ micReady = false;
885
+ speechSeen = false;
886
+ lastVoiceTs = 0;
887
+ pausePreviewed = false;
726
888
  stopWaveLoop();
727
889
  try { audioCtx?.close(); } catch {}
728
890
  audioCtx = null;
729
891
  analyser = null;
730
892
  freqData = null;
893
+ timeData = null;
731
894
  }
732
895
 
733
896
  // ── Reactive wave: amplitude-driven bar heights (runs while mode === listening)
@@ -738,6 +901,43 @@
738
901
  const tick = () => {
739
902
  if (mode !== "listening" || !analyser) { waveRaf = null; return; }
740
903
  analyser.getByteFrequencyData(freqData);
904
+
905
+ // ── Silence auto-send ──────────────────────────────────────────────
906
+ // Time-domain RMS is a reliable voice/silence gate (unlike the freq
907
+ // bars, it's independent of the analyser's dB scaling). Once we've heard
908
+ // speech, SILENCE_MS of quiet commits the recording on its own.
909
+ if (micReady && timeData) {
910
+ analyser.getByteTimeDomainData(timeData);
911
+ let sumSq = 0;
912
+ for (let i = 0; i < timeData.length; i++) {
913
+ const v = (timeData[i] - 128) / 128;
914
+ sumSq += v * v;
915
+ }
916
+ const rms = Math.sqrt(sumSq / timeData.length);
917
+ const now = Date.now();
918
+ if (rms > VOICE_RMS) {
919
+ speechSeen = true;
920
+ lastVoiceTs = now;
921
+ pausePreviewed = false; // new speech → allow a fresh preview
922
+ } else if (speechSeen && lastVoiceTs) {
923
+ const silentFor = now - lastVoiceTs;
924
+ // A short pause kicks ONE decode of everything said so far. It doubles
925
+ // as the final transcription, so the auto-send below is instant
926
+ // instead of paying a decode after stop.
927
+ if (!pausePreviewed && silentFor >= PAUSE_PREVIEW_MS && !liveBusy) {
928
+ pausePreviewed = true;
929
+ runLivePartial();
930
+ }
931
+ // Sustained silence → auto-send, reusing the pause decode.
932
+ if (silentFor >= SILENCE_MS) {
933
+ waveRaf = null;
934
+ reuseLiveOnStop = true;
935
+ stopListening(/* commit */ true);
936
+ return;
937
+ }
938
+ }
939
+ }
940
+
741
941
  const wave = $capCenter.querySelector(".cap-wave");
742
942
  if (wave) {
743
943
  const bars = wave.children;
@@ -781,17 +981,20 @@
781
981
  } catch {}
782
982
  return "";
783
983
  }
784
- async function runLivePartial() {
984
+ // Decode what's been recorded so far (fired once per speech pause). The
985
+ // result is stashed in pendingUserText and reused by the auto-send on stop,
986
+ // so the same audio is never decoded twice. livePromise lets onstop await an
987
+ // in-flight decode before reading the text.
988
+ function runLivePartial() {
785
989
  if (liveBusy || mode !== "listening" || !recordedChunks.length) return;
786
990
  liveBusy = true;
787
- try {
788
- const text = await transcribeBuffered();
789
- if (text && mode === "listening") {
790
- pendingUserText = text;
791
- // No visible live preview in the capsule wave mode; update is mostly
792
- // useful for the conv pending-user partial during transcribing.
793
- }
794
- } finally { liveBusy = false; }
991
+ livePromise = (async () => {
992
+ try {
993
+ const text = await transcribeBuffered();
994
+ if (text && mode === "listening") pendingUserText = text;
995
+ } finally { liveBusy = false; }
996
+ })();
997
+ return livePromise;
795
998
  }
796
999
 
797
1000
  // ── Send: text path + post-transcription commit path ─────────────────────
@@ -819,12 +1022,10 @@
819
1022
  // one ResizeObserver tick later). Shared by commitUserMessage + regen so
820
1023
  // both paths set up the daemon-event pipeline identically.
821
1024
  function startAgentTurn() {
822
- doneHandled = false;
823
- pendingTtsTurnId = null;
824
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
1025
+ beginAgentTurn(); // bump currentTurn + reset the audio queue/guards
825
1026
  mode = "thinking";
826
1027
  render();
827
- ensureStreamingAgentBubble();
1028
+ ensureConv(); // segments will mount their own bubbles
828
1029
  requestWindowResize();
829
1030
  }
830
1031
 
@@ -861,73 +1062,88 @@
861
1062
  window.apx?.onDaemonEvent?.((msg) => {
862
1063
  switch (msg.type) {
863
1064
  case "thinking":
864
- if (mode !== "thinking" && mode !== "speaking") { mode = "thinking"; render(); }
865
- ensureStreamingAgentBubble();
1065
+ // Marks the start of a turn. For locally-initiated turns startAgentTurn
1066
+ // already ran beginAgentTurn() (mode is already "thinking"); for turns
1067
+ // NOT initiated in this window (injected / broadcast from another client)
1068
+ // we set them up here so currentTurn/queue/doneHandled are correct and
1069
+ // the turn doesn't hang.
1070
+ if (mode !== "thinking" && mode !== "speaking") {
1071
+ beginAgentTurn();
1072
+ mode = "thinking";
1073
+ render();
1074
+ } else {
1075
+ doneHandled = false;
1076
+ }
1077
+ ensureConv();
866
1078
  break;
867
1079
  case "token":
1080
+ // Legacy path (backend no longer streams tokens for desktop). Kept so a
1081
+ // mixed-version daemon doesn't break — accumulate into a single bubble.
868
1082
  appendStreamingToken(msg.text || "");
869
1083
  break;
870
1084
  case "tool_start": addToolPill(msg.name); break;
871
1085
  case "tool_done": updateToolPill(msg.name); break;
1086
+ case "segment": {
1087
+ // Each segment is its own agent message bubble + its own audio.
1088
+ ensureConv();
1089
+ const text = (msg.text || "").trim();
1090
+ if (!text) break;
1091
+ const id = nextId++;
1092
+ const m = { id, seq: msg.seq || 0, turn: currentTurn, role: "agent", text, t: nowHHMM(), audio: null, dur: null };
1093
+ messages.push(m);
1094
+ appendTurn(m, true);
1095
+ queueRegisterSegment(m);
1096
+ // Synthesize THIS segment; tts-ready(seg=id) attaches its audio + queues
1097
+ // it for gapless sequential playback.
1098
+ window.apx?.requestTts?.(text, id);
1099
+ requestWindowResize();
1100
+ scrollConvToBottom();
1101
+ break;
1102
+ }
872
1103
  case "done": {
873
- // Daemon may emit `done` twice (retry/race). Process only once per turn.
874
1104
  if (doneHandled) break;
875
1105
  doneHandled = true;
876
- const finalText = msg.text || streamingAgentEntry?.text || "";
877
- // CRITICAL: many models (gemini-flash, groq-fast tier) don't stream
878
- // tokens they send the whole reply in `done`. Without this branch
879
- // the bubble stays with just the dots placeholder until TTS resolves
880
- // (or 6s timeout), which feels broken. Inject the text NOW so the
881
- // user sees the reply immediately.
882
- if (streamingAgentEntry) {
883
- streamingAgentEntry.text = finalText;
884
- if (!streamingAgentEntry.started && finalText) {
885
- streamingAgentEntry.started = true;
886
- streamingAgentEntry.msgEl.innerHTML = formatWordsHtml(finalText);
887
- scrollConvToBottom();
888
- }
889
- }
890
- // Finalize and return to idle right away so the capsule frees up.
891
- // TTS runs in the background; tts-ready will attach the scrubber to
892
- // the already-rendered turn (see attachAudioToLastAgentTurn below).
893
- const finalizedTurnId = streamingAgentEntry?.id;
894
- finalizeStreamingAgent();
895
- mode = "idle"; render();
896
- // Fire-and-forget TTS request. If it returns audio, attach it to
897
- // the turn we just rendered; if it errors / times out / never replies,
898
- // no big deal — the user already has the text. Guard with a 6s soft
899
- // timeout so a stuck request doesn't hold ttsTimer state.
900
- const handled = window.apx?.requestTts?.(finalText);
901
- if (handled) {
902
- if (ttsTimer) clearTimeout(ttsTimer);
903
- ttsTimer = setTimeout(() => { ttsTimer = null; }, 6000);
904
- // Remember which turn the next tts-ready/failed belongs to.
905
- pendingTtsTurnId = finalizedTurnId || null;
1106
+ turnDone = true;
1107
+ // Record the whole turn as one assistant entry for conversation context.
1108
+ const full = (msg.text || "").trim();
1109
+ if (full) history.push({ role: "assistant", content: full });
1110
+ // Safety net: if some segment's TTS never resolves, flush after 12s so
1111
+ // the capsule can't get stuck in "Pensando…".
1112
+ if (turnWatchdog) clearTimeout(turnWatchdog);
1113
+ turnWatchdog = setTimeout(() => {
1114
+ turnAudios.forEach((e) => { if (!e.ready) { e.ready = true; e.failed = true; e.played = true; } });
1115
+ pumpAudioQueue();
1116
+ }, 12000);
1117
+ // Play whatever audio is already ready; flip to idle if there's nothing
1118
+ // left to play (e.g. a turn that produced no audio).
1119
+ pumpAudioQueue();
1120
+ if (!queuePlaying && audioCursor >= turnAudios.length && mode !== "speaking") {
1121
+ mode = "idle"; render();
906
1122
  }
907
1123
  break;
908
1124
  }
909
- case "tts-ready": {
910
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
911
- if (pendingTtsTurnId != null) {
912
- attachAudioToTurn(pendingTtsTurnId, { url: msg.url, dur: msg.duration });
913
- pendingTtsTurnId = null;
914
- }
1125
+ case "tts-ready":
1126
+ if (msg.seg != null) attachAudioToTurn(msg.seg, { url: msg.url, dur: msg.duration });
915
1127
  break;
916
- }
917
- case "tts-failed":
918
- // The text is already on screen; just clean up the timer + pending id.
919
- if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
920
- pendingTtsTurnId = null;
1128
+ case "tts-failed": {
1129
+ // No audio for this segment — skip it in the queue so playback advances.
1130
+ const m = (msg.seg != null) ? messages.find((x) => x.id === msg.seg) : null;
1131
+ if (m) queueMarkFailed(m);
921
1132
  break;
922
- case "error":
923
- finalizeStreamingAgentError(msg.message || "Unknown error");
1133
+ }
1134
+ case "error": {
1135
+ ensureConv();
1136
+ const id = nextId++;
1137
+ const m = { id, seq: 9999, turn: currentTurn, role: "agent", text: "Error: " + (msg.message || "Unknown error"), t: nowHHMM(), isError: true };
1138
+ messages.push(m);
1139
+ appendTurn(m, true);
1140
+ turnDone = true;
1141
+ if (mode !== "speaking") { mode = "idle"; render(); }
924
1142
  break;
1143
+ }
925
1144
  case "cancelled":
926
- if (streamingAgentEntry) {
927
- if (!streamingAgentEntry.text) streamingAgentEntry.el.remove();
928
- else finalizeStreamingAgent();
929
- streamingAgentEntry = null;
930
- }
1145
+ resetTurnAudio();
1146
+ turnDone = true;
931
1147
  mode = "idle"; render();
932
1148
  break;
933
1149
  }
@@ -949,8 +1165,20 @@
949
1165
  document.addEventListener("keydown", (e) => {
950
1166
  if (e.key === "Escape") {
951
1167
  e.preventDefault();
952
- if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") cancel();
953
- else closeWindow();
1168
+ // Escape cancels whatever is in flight (recording / transcribing /
1169
+ // thinking / speaking). If nothing is in flight, a half-typed draft is
1170
+ // cleared first; only an empty idle capsule closes the window.
1171
+ if (mode === "listening" || mode === "transcribing" || mode === "thinking" || mode === "speaking") {
1172
+ cancel();
1173
+ return;
1174
+ }
1175
+ const input = $capCenter.querySelector("input");
1176
+ if (input && input.value.trim()) {
1177
+ input.value = "";
1178
+ render();
1179
+ } else {
1180
+ closeWindow();
1181
+ }
954
1182
  }
955
1183
  });
956
1184
 
@@ -975,6 +1203,13 @@
975
1203
  setInterval(requestWindowResize, 250);
976
1204
  }
977
1205
 
1206
+ // ── Keep STT warm ────────────────────────────────────────────────────────
1207
+ // The whisper server idles out after ~10 min. While the desktop window is
1208
+ // running we ping it every 4 min (and once now) so it stays loaded — the
1209
+ // user's first utterance never pays the cold-load cost.
1210
+ window.apx?.warmupStt?.();
1211
+ setInterval(() => { window.apx?.warmupStt?.(); }, 4 * 60 * 1000);
1212
+
978
1213
  // ── Helpers ──────────────────────────────────────────────────────────────
979
1214
  function nowHHMM() {
980
1215
  const d = new Date();