@agentprojectcontext/apx 1.27.2 → 1.28.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/host/daemon/api/transcribe.js +12 -0
- package/src/host/daemon/plugins/desktop.js +34 -12
- package/src/host/daemon/transcription.js +29 -0
- package/src/host/daemon/whisper-server.py +11 -0
- package/src/interfaces/desktop/main.js +48 -5
- package/src/interfaces/desktop/preload.js +10 -4
- package/src/interfaces/desktop/renderer.js +354 -119
package/package.json
CHANGED
|
@@ -6,6 +6,18 @@
|
|
|
6
6
|
//
|
|
7
7
|
// Shared by overlay, telegram voice messages, and any external caller.
|
|
8
8
|
export function register(app) {
|
|
9
|
+
// GET /transcribe/warmup — load the local whisper model (if needed) and reset
|
|
10
|
+
// its idle watchdog. Callers (e.g. the desktop window) ping this while open so
|
|
11
|
+
// the first real utterance doesn't pay the cold-load cost.
|
|
12
|
+
app.get("/transcribe/warmup", async (_req, res) => {
|
|
13
|
+
try {
|
|
14
|
+
const { warmupWhisper } = await import("../transcription.js");
|
|
15
|
+
res.json(await warmupWhisper());
|
|
16
|
+
} catch (e) {
|
|
17
|
+
res.status(500).json({ ok: false, error: e.message });
|
|
18
|
+
}
|
|
19
|
+
});
|
|
20
|
+
|
|
9
21
|
app.post("/transcribe/chunk", async (req, res) => {
|
|
10
22
|
const chunks = [];
|
|
11
23
|
req.on("data", (c) => chunks.push(c));
|
|
@@ -100,9 +100,27 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
100
100
|
await appendGlobalMessage({ channel: CHANNEL, direction: "in", type: "user", author: "user", body: text });
|
|
101
101
|
} catch {}
|
|
102
102
|
|
|
103
|
-
let fullResponse = "";
|
|
104
103
|
let toolsExecuted = [];
|
|
105
104
|
|
|
105
|
+
// Per-segment streaming: instead of merging the whole turn into one blob, we
|
|
106
|
+
// emit each assistant text piece as its own `segment` (an intro before a tool,
|
|
107
|
+
// then the post-tool answer, …). The renderer renders each as its own bubble
|
|
108
|
+
// and synthesizes its own audio, so a multi-step reply reads as separate spoken
|
|
109
|
+
// messages instead of one run-on bubble. `liveBuf` accumulates streamed tokens
|
|
110
|
+
// (streaming engines) so they can be flushed as a segment at each boundary;
|
|
111
|
+
// for non-streaming models like gemini the text arrives whole via events.
|
|
112
|
+
let segSeq = 0;
|
|
113
|
+
let lastSegText = "";
|
|
114
|
+
let liveBuf = "";
|
|
115
|
+
const emittedSegments = [];
|
|
116
|
+
const emitSegment = (raw) => {
|
|
117
|
+
const seg = (raw || "").trim();
|
|
118
|
+
if (!seg || seg === lastSegText) return;
|
|
119
|
+
lastSegText = seg;
|
|
120
|
+
emittedSegments.push(seg);
|
|
121
|
+
_send(ws, { type: "segment", seq: ++segSeq, text: seg });
|
|
122
|
+
};
|
|
123
|
+
|
|
106
124
|
try {
|
|
107
125
|
if (!isSuperAgentEnabled(config)) {
|
|
108
126
|
throw new Error("super-agent not enabled — set super_agent.enabled + super_agent.model in ~/.apx/config.json");
|
|
@@ -120,10 +138,7 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
120
138
|
previousMessages: history.slice(0, -1),
|
|
121
139
|
overrideModel: cfg.model || null,
|
|
122
140
|
signal: controller.signal,
|
|
123
|
-
onToken: (chunk) => {
|
|
124
|
-
fullResponse += chunk;
|
|
125
|
-
_send(ws, { type: "token", text: chunk });
|
|
126
|
-
},
|
|
141
|
+
onToken: (chunk) => { liveBuf += chunk; },
|
|
127
142
|
onEvent: async (event) => {
|
|
128
143
|
if (event.type === "tool_start") {
|
|
129
144
|
const t = event.trace;
|
|
@@ -131,17 +146,24 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
131
146
|
_send(ws, { type: "tool_start", name: t.tool, args: t.args });
|
|
132
147
|
} else if (event.type === "tool_result") {
|
|
133
148
|
_send(ws, { type: "tool_done", name: event.trace.tool });
|
|
134
|
-
} else if (event.type === "assistant_text" && event.text
|
|
135
|
-
|
|
136
|
-
|
|
149
|
+
} else if (event.type === "assistant_text" && event.text) {
|
|
150
|
+
// A complete assistant text segment (e.g. the "I'll check…" intro
|
|
151
|
+
// emitted right before a tool runs). Ship it as its own message.
|
|
152
|
+
emitSegment(event.text);
|
|
153
|
+
liveBuf = "";
|
|
137
154
|
}
|
|
138
155
|
},
|
|
139
156
|
});
|
|
140
|
-
|
|
141
|
-
|
|
157
|
+
// The final (no-tool) iteration's answer appears ONLY in result.text (or, for
|
|
158
|
+
// streaming engines, in liveBuf) — it's never emitted as an event. Ship it as
|
|
159
|
+
// the closing segment (deduped against the last one).
|
|
160
|
+
emitSegment((result.text || "").trim() || liveBuf.trim());
|
|
161
|
+
|
|
162
|
+
const finalText = emittedSegments.join("\n\n");
|
|
163
|
+
log(`desktop: super-agent turn done in ${Date.now() - t0}ms segments=${segSeq} text_len=${finalText.length} tools=${toolsExecuted.length}`);
|
|
142
164
|
|
|
143
|
-
//
|
|
144
|
-
_send(ws, { type: "done", text: finalText });
|
|
165
|
+
// Turn end. `segments` lets the renderer know how many bubbles to expect.
|
|
166
|
+
_send(ws, { type: "done", segments: segSeq, text: finalText });
|
|
145
167
|
|
|
146
168
|
// Append assistant turn to history
|
|
147
169
|
if (ws && histories) {
|
|
@@ -481,6 +481,35 @@ export async function preloadWhisperServer(log = console.log) {
|
|
|
481
481
|
}
|
|
482
482
|
}
|
|
483
483
|
|
|
484
|
+
/**
|
|
485
|
+
* Keep the local whisper server warm. Ensures it's running and calls /warmup,
|
|
486
|
+
* which resets the server's idle watchdog so a live session (e.g. the desktop
|
|
487
|
+
* window held open) never pays the cold-load cost on the next utterance.
|
|
488
|
+
* Cheap and safe to call repeatedly. Never throws.
|
|
489
|
+
* Returns { ok, provider, model?, loaded } on success, or { ok: false, error } on failure.
|
|
490
|
+
*/
|
|
491
|
+
export async function warmupWhisper() {
|
|
492
|
+
try {
|
|
493
|
+
const cfg = await getConfig();
|
|
494
|
+
if (cfg.provider === "openai") return { ok: true, provider: "openai", loaded: false };
|
|
495
|
+
await ensureWhisperServer(cfg.local);
|
|
496
|
+
// /warmup loads the model into RAM (lazy otherwise) AND touches _last_used,
|
|
497
|
+
// resetting the idle timer. First call may block ~15-30s on a cold model;
|
|
498
|
+
// instant once warm. Generous timeout so the cold load can finish.
|
|
499
|
+
let loaded = false;
|
|
500
|
+
try {
|
|
501
|
+
const r = await fetch(`http://127.0.0.1:${WHISPER_PORT}/warmup`, {
|
|
502
|
+
signal: AbortSignal.timeout(40_000),
|
|
503
|
+
});
|
|
504
|
+
const j = await r.json().catch(() => ({}));
|
|
505
|
+
loaded = !!j.loaded;
|
|
506
|
+
} catch {}
|
|
507
|
+
return { ok: true, provider: "local", model: _serverModel, loaded };
|
|
508
|
+
} catch (e) {
|
|
509
|
+
return { ok: false, error: e.message };
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
|
|
484
513
|
/**
|
|
485
514
|
* Stop the whisper server we own (no-op if we adopted an external one).
|
|
486
515
|
*/
|
|
@@ -94,6 +94,17 @@ class _Handler(BaseHTTPRequestHandler):
|
|
|
94
94
|
"model": _model_name or _Handler.model_name,
|
|
95
95
|
"loaded": _model is not None,
|
|
96
96
|
})
|
|
97
|
+
elif self.path == "/warmup":
|
|
98
|
+
# Eagerly load the model into RAM (no audio needed) and reset the
|
|
99
|
+
# idle timer, so the first real transcription isn't cold. Blocks
|
|
100
|
+
# until the model is loaded the first time; instant once warm.
|
|
101
|
+
_touch()
|
|
102
|
+
with _model_lock:
|
|
103
|
+
try:
|
|
104
|
+
_load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
|
|
105
|
+
self._send_json(200, {"ok": True, "loaded": _model is not None, "model": _model_name})
|
|
106
|
+
except Exception as e:
|
|
107
|
+
self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
|
|
97
108
|
else:
|
|
98
109
|
self._send_json(404, {"ok": False, "error": "not found"})
|
|
99
110
|
|
|
@@ -46,6 +46,20 @@ function getShortcut() {
|
|
|
46
46
|
return cfg?.desktop?.shortcut || cfg?.overlay?.shortcut || DEFAULT_SHORTCUT;
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
+
// Voice-capture timing for the listening capsule. Overridable in config.json:
|
|
50
|
+
// "desktop": { "silence_ms": 1200, "voice_rms": 0.025 }
|
|
51
|
+
// silence_ms — quiet after speech before auto-send. voice_rms — RMS above
|
|
52
|
+
// which audio counts as voice (lower = more sensitive).
|
|
53
|
+
function getVoiceTiming() {
|
|
54
|
+
const cfg = readApxConfig();
|
|
55
|
+
const d = cfg?.desktop || cfg?.overlay || {};
|
|
56
|
+
const num = (v, def) => (typeof v === "number" && isFinite(v) ? v : def);
|
|
57
|
+
return {
|
|
58
|
+
silence_ms: Math.max(400, num(d.silence_ms, 1200)),
|
|
59
|
+
voice_rms: Math.max(0, num(d.voice_rms, 0.025)),
|
|
60
|
+
};
|
|
61
|
+
}
|
|
62
|
+
|
|
49
63
|
function readToken() {
|
|
50
64
|
try { return fs.readFileSync(TOKEN_PATH, "utf8").trim(); } catch { return ""; }
|
|
51
65
|
}
|
|
@@ -397,6 +411,7 @@ ipcMain.handle("get-shortcut", () => getShortcut());
|
|
|
397
411
|
ipcMain.handle("get-theme", () => getTheme());
|
|
398
412
|
ipcMain.handle("get-position", () => getPosition());
|
|
399
413
|
ipcMain.handle("get-agent-name", () => getAgentName());
|
|
414
|
+
ipcMain.handle("get-voice-timing", () => getVoiceTiming());
|
|
400
415
|
|
|
401
416
|
// Renderer asks main to grow/shrink the window to fit its content.
|
|
402
417
|
// Clamped to [WIN_H_MIN, getMaxWindowHeight()]; same anchor (top edge stays put).
|
|
@@ -411,27 +426,29 @@ ipcMain.on("resize-window", (_e, { height }) => {
|
|
|
411
426
|
// Renderer asks for TTS playback of the agent reply. We synthesize via the
|
|
412
427
|
// daemon and pipe the audio path back as a daemon-event the renderer already
|
|
413
428
|
// knows how to consume (tts-ready { url, duration } / tts-failed).
|
|
414
|
-
ipcMain.handle("request-tts", async (_e, { text }) => {
|
|
429
|
+
ipcMain.handle("request-tts", async (_e, { text, seg }) => {
|
|
415
430
|
if (!text || !text.trim()) {
|
|
416
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed" });
|
|
431
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg });
|
|
417
432
|
return;
|
|
418
433
|
}
|
|
419
434
|
try {
|
|
420
435
|
const result = await daemonTtsSay(text);
|
|
421
436
|
if (result?.ok && result.audio_path) {
|
|
422
437
|
// Expose the local file via file:// — preload's contextIsolation lets
|
|
423
|
-
// the renderer's <audio> tag fetch it directly.
|
|
438
|
+
// the renderer's <audio> tag fetch it directly. `seg` ties this audio to
|
|
439
|
+
// the bubble that asked for it.
|
|
424
440
|
const url = "file://" + result.audio_path;
|
|
425
441
|
mainWindow?.webContents.send("daemon-event", {
|
|
426
442
|
type: "tts-ready",
|
|
443
|
+
seg,
|
|
427
444
|
url,
|
|
428
445
|
duration: result.duration_s || 0,
|
|
429
446
|
});
|
|
430
447
|
} else {
|
|
431
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: result?.error || "no audio" });
|
|
448
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: result?.error || "no audio" });
|
|
432
449
|
}
|
|
433
450
|
} catch (e) {
|
|
434
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: e.message });
|
|
451
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: e.message });
|
|
435
452
|
}
|
|
436
453
|
});
|
|
437
454
|
|
|
@@ -462,6 +479,32 @@ ipcMain.handle("check-whisper-ready", () => {
|
|
|
462
479
|
});
|
|
463
480
|
});
|
|
464
481
|
|
|
482
|
+
// Renderer asks to keep STT warm. Routed through the daemon (not whisper
|
|
483
|
+
// directly) so it both LOADS the model if it idled out and resets the idle
|
|
484
|
+
// watchdog. Fire-and-forget from the renderer's side.
|
|
485
|
+
ipcMain.handle("warmup-stt", async () => {
|
|
486
|
+
return new Promise((resolve) => {
|
|
487
|
+
const token = readToken();
|
|
488
|
+
const options = {
|
|
489
|
+
hostname: DAEMON_HOST,
|
|
490
|
+
port: DAEMON_PORT,
|
|
491
|
+
path: "/transcribe/warmup",
|
|
492
|
+
method: "GET",
|
|
493
|
+
headers: { ...(token ? { "Authorization": `Bearer ${token}` } : {}) },
|
|
494
|
+
};
|
|
495
|
+
const req = http.request(options, (res) => {
|
|
496
|
+
let data = "";
|
|
497
|
+
res.on("data", (c) => data += c);
|
|
498
|
+
res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve({ ok: false }); } });
|
|
499
|
+
});
|
|
500
|
+
req.on("error", () => resolve({ ok: false }));
|
|
501
|
+
// Cold model load can take ~30s; give it room. (Renderer fires this
|
|
502
|
+
// fire-and-forget, so a long warm-up never blocks the UI.)
|
|
503
|
+
req.setTimeout(45000, () => { req.destroy(); resolve({ ok: false }); });
|
|
504
|
+
req.end();
|
|
505
|
+
});
|
|
506
|
+
});
|
|
507
|
+
|
|
465
508
|
// Renderer requests recording toggle (ESC cancels, shortcut toggles)
|
|
466
509
|
ipcMain.handle("toggle-recording", async () => {
|
|
467
510
|
if (isRecording) stopRecording(); else startRecording();
|
|
@@ -18,14 +18,19 @@ contextBridge.exposeInMainWorld("apx", {
|
|
|
18
18
|
// Check if the whisper model is loaded (false = still loading)
|
|
19
19
|
checkWhisperReady: () => ipcRenderer.invoke("check-whisper-ready"),
|
|
20
20
|
|
|
21
|
+
// Keep STT warm (loads the model if idle + resets the idle timer). Called
|
|
22
|
+
// while the window is open / on mic-open so the first decode isn't cold.
|
|
23
|
+
warmupStt: () => ipcRenderer.invoke("warmup-stt").catch(() => ({ ok: false })),
|
|
24
|
+
|
|
21
25
|
// Send final text to daemon
|
|
22
26
|
sendMessage: (text, previousMessages) =>
|
|
23
27
|
ipcRenderer.invoke("send-message", { text, previousMessages }),
|
|
24
28
|
|
|
25
|
-
//
|
|
26
|
-
//
|
|
27
|
-
|
|
28
|
-
|
|
29
|
+
// Ask main to synthesize TTS for one segment. `seg` correlates the resulting
|
|
30
|
+
// tts-ready/tts-failed event back to the bubble that requested it (each
|
|
31
|
+
// assistant message has its own audio). Returns true optimistically.
|
|
32
|
+
requestTts: (text, seg) => {
|
|
33
|
+
ipcRenderer.invoke("request-tts", { text, seg }).catch(() => {});
|
|
29
34
|
return true; // optimistic; renderer waits for the event either way
|
|
30
35
|
},
|
|
31
36
|
|
|
@@ -42,6 +47,7 @@ contextBridge.exposeInMainWorld("apx", {
|
|
|
42
47
|
getTheme: () => ipcRenderer.invoke("get-theme"),
|
|
43
48
|
getPosition: () => ipcRenderer.invoke("get-position"),
|
|
44
49
|
getAgentName: () => ipcRenderer.invoke("get-agent-name"),
|
|
50
|
+
getVoiceTiming: () => ipcRenderer.invoke("get-voice-timing"),
|
|
45
51
|
|
|
46
52
|
// Renderer asks main to resize the BrowserWindow to the rendered height
|
|
47
53
|
resize: (height) => ipcRenderer.send("resize-window", { height }),
|
|
@@ -31,15 +31,49 @@
|
|
|
31
31
|
let recorderFormat = "webm";
|
|
32
32
|
let liveBusy = false;
|
|
33
33
|
|
|
34
|
+
// Mic is async to open (getUserMedia + recorder warm-up). Until it's actually
|
|
35
|
+
// capturing we show a "Cargando…" state instead of the wave, so the user
|
|
36
|
+
// doesn't talk into the dead gap before the recorder starts.
|
|
37
|
+
let micReady = false;
|
|
38
|
+
|
|
39
|
+
// Silence auto-send: once speech has been heard, SILENCE_MS of quiet
|
|
40
|
+
// auto-commits the recording. RMS (time-domain) is the voice/silence gate.
|
|
41
|
+
// Both are overridable from config.json (desktop.silence_ms / voice_rms).
|
|
42
|
+
let speechSeen = false;
|
|
43
|
+
let lastVoiceTs = 0;
|
|
44
|
+
let SILENCE_MS = 1200; // quiet after speech → send on its own
|
|
45
|
+
let VOICE_RMS = 0.025; // RMS above this counts as voice (0 = silence)
|
|
46
|
+
const PAUSE_PREVIEW_MS = 600; // a short pause kicks ONE decode (reused on send)
|
|
47
|
+
|
|
48
|
+
// When a pause triggers a preview decode, that decode already covers all the
|
|
49
|
+
// speech (the tail is just trailing silence), so the auto-send reuses it
|
|
50
|
+
// instead of paying a second full decode. These coordinate that handoff.
|
|
51
|
+
let pausePreviewed = false; // a preview decode fired for the current pause
|
|
52
|
+
let reuseLiveOnStop = false; // commit should reuse pendingUserText, not re-decode
|
|
53
|
+
let livePromise = null; // in-flight preview decode (awaited on reuse)
|
|
54
|
+
|
|
34
55
|
// Web Audio analyser — drives the live capsule wave from real mic amplitude
|
|
35
56
|
let audioCtx = null;
|
|
36
57
|
let analyser = null;
|
|
37
58
|
let freqData = null;
|
|
59
|
+
let timeData = null;
|
|
38
60
|
let waveRaf = null;
|
|
39
61
|
|
|
40
|
-
let streamingAgentEntry = null; //
|
|
41
|
-
let toolPillsByName = {}; // active tool pills
|
|
42
|
-
let ttsAudio = null; // <audio> playing
|
|
62
|
+
let streamingAgentEntry = null; // legacy single-bubble streaming (kept dormant)
|
|
63
|
+
let toolPillsByName = {}; // active tool pills, by tool name, for the live turn
|
|
64
|
+
let ttsAudio = null; // <audio> currently playing
|
|
65
|
+
|
|
66
|
+
// ── Per-segment turn rendering ──────────────────────────────────────────
|
|
67
|
+
// A turn is now N agent message bubbles (intro, post-tool answer, …), each
|
|
68
|
+
// with its own audio. `currentTurn` tags every bubble of a turn so regen can
|
|
69
|
+
// drop the whole turn. The audio queue plays segment audios in seq order
|
|
70
|
+
// (gapless auto-play), waiting at the cursor for each segment's TTS to land.
|
|
71
|
+
let currentTurn = 0;
|
|
72
|
+
let turnAudios = []; // [{ m, ready, failed, played }] ordered by seq
|
|
73
|
+
let audioCursor = 0; // index of the next segment to play
|
|
74
|
+
let queuePlaying = false; // a segment audio is currently playing
|
|
75
|
+
let turnDone = false; // `done` received for the active turn
|
|
76
|
+
let turnWatchdog = null; // flushes the queue if a segment's TTS hangs
|
|
43
77
|
|
|
44
78
|
let history = []; // [{role:'user'|'assistant', content}] sent to daemon for context
|
|
45
79
|
let theme = "light";
|
|
@@ -119,10 +153,15 @@
|
|
|
119
153
|
window.apx?.getPosition?.() ?? "right",
|
|
120
154
|
window.apx?.getShortcut?.() ?? "CommandOrControl+G",
|
|
121
155
|
window.apx?.getAgentName?.() ?? "Superagente",
|
|
122
|
-
|
|
156
|
+
window.apx?.getVoiceTiming?.() ?? null,
|
|
157
|
+
]).then(([th, pos, shortcut, name, timing]) => {
|
|
123
158
|
theme = th || "light";
|
|
124
159
|
position = pos || "right";
|
|
125
160
|
agentName = (name && String(name).trim()) || "Superagente";
|
|
161
|
+
if (timing) {
|
|
162
|
+
if (typeof timing.silence_ms === "number") SILENCE_MS = timing.silence_ms;
|
|
163
|
+
if (typeof timing.voice_rms === "number") VOICE_RMS = timing.voice_rms;
|
|
164
|
+
}
|
|
126
165
|
document.documentElement.setAttribute("data-theme", theme);
|
|
127
166
|
setPosition(position);
|
|
128
167
|
initialCaption(shortcut);
|
|
@@ -217,6 +256,13 @@
|
|
|
217
256
|
}
|
|
218
257
|
}
|
|
219
258
|
// else: input already there → leave it alone (preserves focus + caret)
|
|
259
|
+
} else if (mode === "listening" && !micReady) {
|
|
260
|
+
// Mic still opening (getUserMedia + recorder warm-up). Show a loading
|
|
261
|
+
// status so the user waits for capture instead of talking into the gap.
|
|
262
|
+
if ($capCenter.dataset.mode !== "loading") {
|
|
263
|
+
$capCenter.dataset.mode = "loading";
|
|
264
|
+
$capCenter.innerHTML = `<span class="status"><span class="dots"><i></i><i></i><i></i></span><span class="shimmer">Cargando…</span></span>`;
|
|
265
|
+
}
|
|
220
266
|
} else if (mode === "listening") {
|
|
221
267
|
// Only rebuild the wave if it's not already there (avoids restarting
|
|
222
268
|
// CSS animations / Web Audio binding every render).
|
|
@@ -246,9 +292,10 @@
|
|
|
246
292
|
}
|
|
247
293
|
}
|
|
248
294
|
}
|
|
249
|
-
// Clear data-mode when we're back to idle
|
|
250
|
-
// re-renders correctly.
|
|
251
|
-
|
|
295
|
+
// Clear data-mode when we're back to idle, or once the live wave is up, so
|
|
296
|
+
// a future busy mode re-renders correctly. While the mic is still warming
|
|
297
|
+
// up we keep the "loading" marker so "Cargando…" isn't rebuilt every frame.
|
|
298
|
+
if (mode === "idle" || (mode === "listening" && micReady)) $capCenter.dataset.mode = "";
|
|
252
299
|
|
|
253
300
|
// actions
|
|
254
301
|
$capActions.innerHTML = "";
|
|
@@ -273,7 +320,8 @@
|
|
|
273
320
|
}
|
|
274
321
|
} else if (mode === "listening") {
|
|
275
322
|
addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
|
|
276
|
-
|
|
323
|
+
// No "Enviar" until the recorder is live — nothing to send mid-warm-up.
|
|
324
|
+
if (micReady) addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
|
|
277
325
|
} else if (mode === "transcribing") {
|
|
278
326
|
addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
|
|
279
327
|
} else if (mode === "thinking") {
|
|
@@ -321,7 +369,6 @@
|
|
|
321
369
|
// Re-render all existing turns
|
|
322
370
|
messages.forEach((m, i) => appendTurn(m, i === messages.length - 1));
|
|
323
371
|
if (mode === "transcribing") renderPendingUserPartial();
|
|
324
|
-
if (mode === "thinking" || mode === "speaking") ensureStreamingAgentBubble();
|
|
325
372
|
}
|
|
326
373
|
}
|
|
327
374
|
|
|
@@ -388,7 +435,10 @@
|
|
|
388
435
|
if (history.length && history[history.length - 1].role === "assistant") {
|
|
389
436
|
history.pop();
|
|
390
437
|
}
|
|
391
|
-
|
|
438
|
+
// A turn can be several agent bubbles (intro + post-tool answer…); drop
|
|
439
|
+
// them all so regen replaces the whole turn, not just the last segment.
|
|
440
|
+
const turnId = m.turn;
|
|
441
|
+
messages = messages.filter((x) => !(x.role === "agent" && turnId != null && x.turn === turnId) && x.id !== m.id);
|
|
392
442
|
rebuildConvFromState();
|
|
393
443
|
startAgentTurn();
|
|
394
444
|
sendToDaemon(lastUser.text);
|
|
@@ -499,12 +549,14 @@
|
|
|
499
549
|
}
|
|
500
550
|
|
|
501
551
|
function addToolPill(name) {
|
|
502
|
-
|
|
503
|
-
if (toolPillsByName[name]) return;
|
|
552
|
+
ensureConv();
|
|
553
|
+
if (!$convScroll || toolPillsByName[name]) return;
|
|
504
554
|
const pill = document.createElement("div");
|
|
505
555
|
pill.className = "tool-pill";
|
|
506
556
|
pill.innerHTML = `<div class="spinner"></div><span>${escapeHtml(name)}</span>`;
|
|
507
|
-
|
|
557
|
+
// Append at the end of the conversation flow — pills sit between the
|
|
558
|
+
// segment bubbles in the order tools actually run.
|
|
559
|
+
$convScroll.appendChild(pill);
|
|
508
560
|
toolPillsByName[name] = pill;
|
|
509
561
|
scrollConvToBottom();
|
|
510
562
|
}
|
|
@@ -541,53 +593,46 @@
|
|
|
541
593
|
const dur = m.dur || 1;
|
|
542
594
|
const fmt = (s) => `0:${String(Math.round(s)).padStart(2, "0")}`;
|
|
543
595
|
const audio = new Audio(m.audio);
|
|
596
|
+
m._audioEl = audio; // the audio queue drives sequential playback
|
|
544
597
|
let raf = null;
|
|
545
|
-
let progress = 0;
|
|
546
598
|
|
|
547
599
|
const setProgress = (p) => {
|
|
548
|
-
|
|
549
|
-
const cur = Math.floor(
|
|
600
|
+
p = Math.max(0, Math.min(1, p));
|
|
601
|
+
const cur = Math.floor(p * N);
|
|
550
602
|
bars.forEach((b, i) => {
|
|
551
603
|
b.classList.toggle("on", i <= cur);
|
|
552
604
|
b.classList.toggle("cur", i === cur && !audio.paused);
|
|
553
605
|
});
|
|
554
|
-
$dur.textContent =
|
|
606
|
+
$dur.textContent = p > 0 || !audio.paused ? fmt(p * dur) : fmt(dur);
|
|
555
607
|
};
|
|
556
|
-
|
|
557
608
|
const tick = () => {
|
|
558
609
|
if (audio.duration > 0) setProgress(audio.currentTime / audio.duration);
|
|
559
610
|
raf = requestAnimationFrame(tick);
|
|
560
611
|
};
|
|
561
|
-
audio.addEventListener("play",
|
|
562
|
-
audio.addEventListener("pause",
|
|
563
|
-
audio.addEventListener("ended",
|
|
612
|
+
audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); if (mode !== "speaking") { mode = "speaking"; render(); } });
|
|
613
|
+
audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); });
|
|
614
|
+
audio.addEventListener("ended", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); setProgress(1); onSegmentEnded(m); });
|
|
615
|
+
// 404 / decode error / autoplay block: don't hang — advance the queue.
|
|
616
|
+
audio.addEventListener("error", () => onSegmentEnded(m));
|
|
564
617
|
|
|
565
|
-
$play.addEventListener("click", () =>
|
|
618
|
+
$play.addEventListener("click", () => {
|
|
619
|
+
if (audio.paused) {
|
|
620
|
+
// Manual play takes control — stop the auto-sequence so we don't fight it.
|
|
621
|
+
queuePlaying = false;
|
|
622
|
+
try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
|
|
623
|
+
ttsAudio = audio;
|
|
624
|
+
audio.play().catch(() => { if (mode === "speaking") { mode = "idle"; render(); } });
|
|
625
|
+
} else {
|
|
626
|
+
audio.pause();
|
|
627
|
+
if (mode === "speaking") { mode = "idle"; render(); }
|
|
628
|
+
}
|
|
629
|
+
});
|
|
566
630
|
$bar.addEventListener("click", (e) => {
|
|
567
631
|
const r = $bar.getBoundingClientRect();
|
|
568
632
|
const p = Math.max(0, Math.min(1, (e.clientX - r.left) / r.width));
|
|
569
633
|
if (audio.duration > 0) audio.currentTime = p * audio.duration;
|
|
570
634
|
setProgress(p);
|
|
571
635
|
});
|
|
572
|
-
|
|
573
|
-
// If the audio errors out (404, decode error, autoplay block, etc) make
|
|
574
|
-
// sure the capsule doesn't stay stuck in "está hablando…".
|
|
575
|
-
audio.addEventListener("error", () => {
|
|
576
|
-
if (mode === "speaking") { mode = "idle"; render(); }
|
|
577
|
-
});
|
|
578
|
-
|
|
579
|
-
// autoplay if it's the fresh reply
|
|
580
|
-
if (m.fresh) {
|
|
581
|
-
m.fresh = false;
|
|
582
|
-
ttsAudio?.pause?.();
|
|
583
|
-
ttsAudio = audio;
|
|
584
|
-
audio.play().catch(() => {
|
|
585
|
-
// Autoplay block (rare in Electron with user-gesture but possible
|
|
586
|
-
// when the window has never been focused). Bail out so the capsule
|
|
587
|
-
// returns to idle and the user can still tap "play" on the scrubber.
|
|
588
|
-
if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
|
|
589
|
-
});
|
|
590
|
-
}
|
|
591
636
|
}
|
|
592
637
|
|
|
593
638
|
// Post-finalize hook: add a scrubber to an already-rendered agent turn
|
|
@@ -597,15 +642,18 @@
|
|
|
597
642
|
if (!m) return;
|
|
598
643
|
m.audio = url;
|
|
599
644
|
m.dur = dur || 0;
|
|
600
|
-
m.fresh = true; // autoplay the freshly-arrived reply
|
|
601
645
|
const turnEl = $convScroll?.querySelector(`[data-id="${turnId}"]`);
|
|
602
|
-
if (!turnEl)
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
646
|
+
if (turnEl && !turnEl.querySelector(".audio")) {
|
|
647
|
+
// Insert the scrubber HTML just before turn-actions (matches appendTurn).
|
|
648
|
+
const actions = turnEl.querySelector(".turn-actions");
|
|
649
|
+
const html = buildScrubberHtml(m);
|
|
650
|
+
if (actions) actions.insertAdjacentHTML("beforebegin", html);
|
|
651
|
+
else turnEl.insertAdjacentHTML("beforeend", html);
|
|
652
|
+
wireScrubber(turnEl, m); // sets m._audioEl
|
|
653
|
+
}
|
|
654
|
+
// Audio is ready → let the sequential queue play it when it's this
|
|
655
|
+
// segment's turn (gapless auto-play across the turn's bubbles).
|
|
656
|
+
queueMarkReady(m);
|
|
609
657
|
scrollConvToBottom();
|
|
610
658
|
}
|
|
611
659
|
|
|
@@ -620,10 +668,94 @@
|
|
|
620
668
|
return out;
|
|
621
669
|
}
|
|
622
670
|
|
|
671
|
+
// ── Per-turn setup + sequential audio queue ──────────────────────────────
|
|
672
|
+
// Each turn renders N agent bubbles (segments), each with its own audio. We
|
|
673
|
+
// play those audios in `seq` order, gaplessly: the cursor waits at a segment
|
|
674
|
+
// until its TTS lands, plays it, then advances. So Roby "speaks" its messages
|
|
675
|
+
// one after another even though they synthesize at different speeds.
|
|
676
|
+
function beginAgentTurn() {
|
|
677
|
+
currentTurn++;
|
|
678
|
+
resetTurnAudio();
|
|
679
|
+
doneHandled = false;
|
|
680
|
+
pendingTtsTurnId = null;
|
|
681
|
+
if (ttsTimer) { clearTimeout(ttsTimer); ttsTimer = null; }
|
|
682
|
+
}
|
|
683
|
+
function resetTurnAudio() {
|
|
684
|
+
try { ttsAudio?.pause?.(); } catch {}
|
|
685
|
+
ttsAudio = null;
|
|
686
|
+
turnAudios = [];
|
|
687
|
+
audioCursor = 0;
|
|
688
|
+
queuePlaying = false;
|
|
689
|
+
turnDone = false;
|
|
690
|
+
if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
|
|
691
|
+
}
|
|
692
|
+
function queueRegisterSegment(m) {
|
|
693
|
+
if (!turnAudios.some((e) => e.m === m)) {
|
|
694
|
+
turnAudios.push({ m, ready: false, failed: false, played: false });
|
|
695
|
+
turnAudios.sort((a, b) => (a.m.seq || 0) - (b.m.seq || 0));
|
|
696
|
+
}
|
|
697
|
+
}
|
|
698
|
+
function queueMarkReady(m) {
|
|
699
|
+
const e = turnAudios.find((x) => x.m === m);
|
|
700
|
+
if (e) e.ready = true;
|
|
701
|
+
pumpAudioQueue();
|
|
702
|
+
}
|
|
703
|
+
function queueMarkFailed(m) {
|
|
704
|
+
const e = turnAudios.find((x) => x.m === m);
|
|
705
|
+
if (e) { e.ready = true; e.failed = true; e.played = true; }
|
|
706
|
+
pumpAudioQueue();
|
|
707
|
+
}
|
|
708
|
+
function pumpAudioQueue() {
|
|
709
|
+
if (queuePlaying) return;
|
|
710
|
+
while (audioCursor < turnAudios.length) {
|
|
711
|
+
const e = turnAudios[audioCursor];
|
|
712
|
+
if (!e.ready) return; // wait for this segment's TTS
|
|
713
|
+
if (e.played || e.failed || !e.m._audioEl) { audioCursor++; continue; }
|
|
714
|
+
const audio = e.m._audioEl;
|
|
715
|
+
queuePlaying = true;
|
|
716
|
+
try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
|
|
717
|
+
ttsAudio = audio;
|
|
718
|
+
audio.play().catch(() => { // autoplay blocked / decode error
|
|
719
|
+
queuePlaying = false;
|
|
720
|
+
e.played = true;
|
|
721
|
+
audioCursor++;
|
|
722
|
+
pumpAudioQueue();
|
|
723
|
+
});
|
|
724
|
+
return;
|
|
725
|
+
}
|
|
726
|
+
// Drained. Once the turn is done and nothing's left, return to idle.
|
|
727
|
+
if (turnDone) {
|
|
728
|
+
if (turnWatchdog) { clearTimeout(turnWatchdog); turnWatchdog = null; }
|
|
729
|
+
if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
// Called from a segment audio's `ended` (or `error`). Advances the queue.
|
|
733
|
+
function onSegmentEnded(m) {
|
|
734
|
+
const e = turnAudios.find((x) => x.m === m);
|
|
735
|
+
if (e) { if (e.played) return; e.played = true; }
|
|
736
|
+
if (queuePlaying && ttsAudio === m._audioEl) {
|
|
737
|
+
queuePlaying = false;
|
|
738
|
+
audioCursor++;
|
|
739
|
+
pumpAudioQueue();
|
|
740
|
+
} else if (mode === "speaking") {
|
|
741
|
+
mode = "idle"; render();
|
|
742
|
+
}
|
|
743
|
+
}
|
|
744
|
+
|
|
623
745
|
// ── Recording flow ───────────────────────────────────────────────────────
|
|
624
746
|
function startListening() {
|
|
625
747
|
if (mode !== "idle") return;
|
|
626
748
|
isCancelled = false;
|
|
749
|
+
micReady = false; // show "Cargando…" until the recorder is actually live
|
|
750
|
+
speechSeen = false;
|
|
751
|
+
lastVoiceTs = 0;
|
|
752
|
+
pausePreviewed = false;
|
|
753
|
+
reuseLiveOnStop = false;
|
|
754
|
+
livePromise = null;
|
|
755
|
+
pendingUserText = "";
|
|
756
|
+
// Warm the whisper model now (overlaps the mic warm-up), so the decode at
|
|
757
|
+
// the end of this utterance doesn't pay a cold start.
|
|
758
|
+
window.apx?.warmupStt?.();
|
|
627
759
|
mode = "listening";
|
|
628
760
|
render();
|
|
629
761
|
startMic();
|
|
@@ -649,6 +781,7 @@
|
|
|
649
781
|
if (mode === "listening") { stopMic(); }
|
|
650
782
|
if (mode === "thinking" || mode === "speaking") { window.apx?.cancel?.(); }
|
|
651
783
|
removePendingUserPartial();
|
|
784
|
+
resetTurnAudio(); // stop any playing/queued segment audio
|
|
652
785
|
if (streamingAgentEntry) {
|
|
653
786
|
streamingAgentEntry.el.remove();
|
|
654
787
|
streamingAgentEntry = null;
|
|
@@ -658,6 +791,8 @@
|
|
|
658
791
|
}
|
|
659
792
|
|
|
660
793
|
function stopSpeaking() {
  // Clear the queue flag first so the auto-sequence halts, then pause the
  // current segment.
  queuePlaying = false;
  try {
    ttsAudio?.pause?.();
  } catch {
    // Best-effort: a failing pause() must not block the state change below.
  }
  if (mode !== "speaking") return;
  mode = "idle";
  render();
}
|
|
@@ -678,6 +813,7 @@
|
|
|
678
813
|
analyser.maxDecibels = -15; // ceiling (loud speech)
|
|
679
814
|
src.connect(analyser);
|
|
680
815
|
freqData = new Uint8Array(analyser.frequencyBinCount);
|
|
816
|
+
timeData = new Uint8Array(analyser.fftSize);
|
|
681
817
|
startWaveLoop();
|
|
682
818
|
} catch (e) {
|
|
683
819
|
console.warn("desktop renderer: AnalyserNode init failed", e);
|
|
@@ -691,14 +827,27 @@
|
|
|
691
827
|
recordedChunks = [];
|
|
692
828
|
mediaRecorder = new MediaRecorder(audioStream, { mimeType, audioBitsPerSecond: 32000 });
|
|
693
829
|
mediaRecorder.ondataavailable = (e) => {
|
|
830
|
+
// Just buffer. We deliberately do NOT decode on every chunk anymore —
|
|
831
|
+
// re-decoding the growing clip every 2s serialized on the single
|
|
832
|
+
// whisper thread and the final decode queued behind it (the old ~10s
|
|
833
|
+
// stall). Transcription now happens once, on a pause / on stop.
|
|
694
834
|
if (e.data && e.data.size > 0) recordedChunks.push(e.data);
|
|
695
|
-
runLivePartial();
|
|
696
835
|
};
|
|
697
836
|
mediaRecorder.onstop = async () => {
|
|
698
837
|
if (isCancelled) { recordedChunks = []; if (mode !== "idle") { mode = "idle"; render(); } return; }
|
|
699
|
-
|
|
700
|
-
|
|
838
|
+
let text = "";
|
|
839
|
+
// Auto-send after a pause: the pause already kicked a full decode that
|
|
840
|
+
// covers all the speech (the only thing after it is trailing silence),
|
|
841
|
+
// so reuse it instead of decoding the same audio again. Await the
|
|
842
|
+
// in-flight preview if it hasn't settled yet.
|
|
843
|
+
if (reuseLiveOnStop) {
|
|
844
|
+
if (livePromise) { try { await livePromise; } catch {} }
|
|
845
|
+
text = (pendingUserText || "").trim();
|
|
846
|
+
}
|
|
847
|
+
// Manual send (Enviar / ⌘G release) or no preview yet → one fresh decode.
|
|
848
|
+
if (!text) text = (await transcribeBuffered()).trim();
|
|
701
849
|
recordedChunks = [];
|
|
850
|
+
reuseLiveOnStop = false;
|
|
702
851
|
// Guard with .trim() — whisper occasionally returns a single space or
|
|
703
852
|
// newline for very short clips, which used to commit an empty bubble.
|
|
704
853
|
if (!text || isCancelled) {
|
|
@@ -711,7 +860,16 @@
|
|
|
711
860
|
pendingUserText = text;
|
|
712
861
|
commitUserMessage(text, /* via */ "voice");
|
|
713
862
|
};
|
|
714
|
-
|
|
863
|
+
// 1s timeslice: chunks land often enough that a pause-preview decode has
|
|
864
|
+
// audio to work with even for short utterances. We no longer decode per
|
|
865
|
+
// chunk (just buffer), so a smaller slice is essentially free.
|
|
866
|
+
mediaRecorder.start(1000);
|
|
867
|
+
// Recorder is now live → swap "Cargando…" for the reactive wave and let
|
|
868
|
+
// silence detection arm. lastVoiceTs starts now so a fully silent open
|
|
869
|
+
// won't auto-send (speechSeen gates that).
|
|
870
|
+
micReady = true;
|
|
871
|
+
lastVoiceTs = Date.now();
|
|
872
|
+
if (mode === "listening") render();
|
|
715
873
|
} catch (e) {
|
|
716
874
|
console.error("desktop renderer: mic error", e);
|
|
717
875
|
mode = "idle";
|
|
@@ -723,11 +881,16 @@
|
|
|
723
881
|
try { audioStream?.getTracks().forEach((t) => t.stop()); } catch {}
|
|
724
882
|
mediaRecorder = null;
|
|
725
883
|
audioStream = null;
|
|
884
|
+
micReady = false;
|
|
885
|
+
speechSeen = false;
|
|
886
|
+
lastVoiceTs = 0;
|
|
887
|
+
pausePreviewed = false;
|
|
726
888
|
stopWaveLoop();
|
|
727
889
|
try { audioCtx?.close(); } catch {}
|
|
728
890
|
audioCtx = null;
|
|
729
891
|
analyser = null;
|
|
730
892
|
freqData = null;
|
|
893
|
+
timeData = null;
|
|
731
894
|
}
|
|
732
895
|
|
|
733
896
|
// ── Reactive wave: amplitude-driven bar heights (runs while mode === listening)
|
|
@@ -738,6 +901,43 @@
|
|
|
738
901
|
const tick = () => {
|
|
739
902
|
if (mode !== "listening" || !analyser) { waveRaf = null; return; }
|
|
740
903
|
analyser.getByteFrequencyData(freqData);
|
|
904
|
+
|
|
905
|
+
// ── Silence auto-send ──────────────────────────────────────────────
|
|
906
|
+
// Time-domain RMS is a reliable voice/silence gate (unlike the freq
|
|
907
|
+
// bars, it's independent of the analyser's dB scaling). Once we've heard
|
|
908
|
+
// speech, SILENCE_MS of quiet commits the recording on its own.
|
|
909
|
+
if (micReady && timeData) {
|
|
910
|
+
analyser.getByteTimeDomainData(timeData);
|
|
911
|
+
let sumSq = 0;
|
|
912
|
+
for (let i = 0; i < timeData.length; i++) {
|
|
913
|
+
const v = (timeData[i] - 128) / 128;
|
|
914
|
+
sumSq += v * v;
|
|
915
|
+
}
|
|
916
|
+
const rms = Math.sqrt(sumSq / timeData.length);
|
|
917
|
+
const now = Date.now();
|
|
918
|
+
if (rms > VOICE_RMS) {
|
|
919
|
+
speechSeen = true;
|
|
920
|
+
lastVoiceTs = now;
|
|
921
|
+
pausePreviewed = false; // new speech → allow a fresh preview
|
|
922
|
+
} else if (speechSeen && lastVoiceTs) {
|
|
923
|
+
const silentFor = now - lastVoiceTs;
|
|
924
|
+
// A short pause kicks ONE decode of everything said so far. It doubles
|
|
925
|
+
// as the final transcription, so the auto-send below is instant
|
|
926
|
+
// instead of paying a decode after stop.
|
|
927
|
+
if (!pausePreviewed && silentFor >= PAUSE_PREVIEW_MS && !liveBusy) {
|
|
928
|
+
pausePreviewed = true;
|
|
929
|
+
runLivePartial();
|
|
930
|
+
}
|
|
931
|
+
// Sustained silence → auto-send, reusing the pause decode.
|
|
932
|
+
if (silentFor >= SILENCE_MS) {
|
|
933
|
+
waveRaf = null;
|
|
934
|
+
reuseLiveOnStop = true;
|
|
935
|
+
stopListening(/* commit */ true);
|
|
936
|
+
return;
|
|
937
|
+
}
|
|
938
|
+
}
|
|
939
|
+
}
|
|
940
|
+
|
|
741
941
|
const wave = $capCenter.querySelector(".cap-wave");
|
|
742
942
|
if (wave) {
|
|
743
943
|
const bars = wave.children;
|
|
@@ -781,17 +981,20 @@
|
|
|
781
981
|
} catch {}
|
|
782
982
|
return "";
|
|
783
983
|
}
|
|
784
|
-
|
|
984
|
+
// Decode what's been recorded so far (fired once per speech pause). The
// result is stashed in pendingUserText and reused by the auto-send on stop,
// so the same audio is never decoded twice. livePromise lets onstop await an
// in-flight decode before reading the text.
//
// Returns the in-flight decode promise, or undefined when nothing was started
// (a decode is already running, we are not listening, or no audio is buffered).
function runLivePartial() {
  const canDecode = !liveBusy && mode === "listening" && recordedChunks.length > 0;
  if (!canDecode) return;

  liveBusy = true;
  const decodeOnce = async () => {
    try {
      const decoded = await transcribeBuffered();
      // Only keep the text if we are still listening when the decode settles.
      if (decoded && mode === "listening") pendingUserText = decoded;
    } finally {
      liveBusy = false;
    }
  };
  livePromise = decodeOnce();
  return livePromise;
}
|
|
796
999
|
|
|
797
1000
|
// ── Send: text path + post-transcription commit path ─────────────────────
|
|
@@ -819,12 +1022,10 @@
|
|
|
819
1022
|
// one ResizeObserver tick later). Shared by commitUserMessage + regen so
|
|
820
1023
|
// both paths set up the daemon-event pipeline identically.
|
|
821
1024
|
function startAgentTurn() {
  // Bump currentTurn + reset the audio queue/guards before anything renders.
  beginAgentTurn();

  mode = "thinking";
  render();

  // Segments will mount their own bubbles, so the conversation container
  // only needs to exist at this point.
  ensureConv();
  requestWindowResize();
}
|
|
830
1031
|
|
|
@@ -861,73 +1062,88 @@
|
|
|
861
1062
|
window.apx?.onDaemonEvent?.((msg) => {
|
|
862
1063
|
switch (msg.type) {
|
|
863
1064
|
case "thinking":
|
|
864
|
-
|
|
865
|
-
|
|
1065
|
+
// Marks the start of a turn. For locally-initiated turns startAgentTurn
|
|
1066
|
+
// already ran beginAgentTurn() (mode is already "thinking"); for turns
|
|
1067
|
+
// NOT initiated in this window (injected / broadcast from another client)
|
|
1068
|
+
// we set them up here so currentTurn/queue/doneHandled are correct and
|
|
1069
|
+
// the turn doesn't hang.
|
|
1070
|
+
if (mode !== "thinking" && mode !== "speaking") {
|
|
1071
|
+
beginAgentTurn();
|
|
1072
|
+
mode = "thinking";
|
|
1073
|
+
render();
|
|
1074
|
+
} else {
|
|
1075
|
+
doneHandled = false;
|
|
1076
|
+
}
|
|
1077
|
+
ensureConv();
|
|
866
1078
|
break;
|
|
867
1079
|
case "token":
|
|
1080
|
+
// Legacy path (backend no longer streams tokens for desktop). Kept so a
|
|
1081
|
+
// mixed-version daemon doesn't break — accumulate into a single bubble.
|
|
868
1082
|
appendStreamingToken(msg.text || "");
|
|
869
1083
|
break;
|
|
870
1084
|
case "tool_start": addToolPill(msg.name); break;
|
|
871
1085
|
case "tool_done": updateToolPill(msg.name); break;
|
|
1086
|
+
case "segment": {
|
|
1087
|
+
// Each segment is its own agent message bubble + its own audio.
|
|
1088
|
+
ensureConv();
|
|
1089
|
+
const text = (msg.text || "").trim();
|
|
1090
|
+
if (!text) break;
|
|
1091
|
+
const id = nextId++;
|
|
1092
|
+
const m = { id, seq: msg.seq || 0, turn: currentTurn, role: "agent", text, t: nowHHMM(), audio: null, dur: null };
|
|
1093
|
+
messages.push(m);
|
|
1094
|
+
appendTurn(m, true);
|
|
1095
|
+
queueRegisterSegment(m);
|
|
1096
|
+
// Synthesize THIS segment; tts-ready(seg=id) attaches its audio + queues
|
|
1097
|
+
// it for gapless sequential playback.
|
|
1098
|
+
window.apx?.requestTts?.(text, id);
|
|
1099
|
+
requestWindowResize();
|
|
1100
|
+
scrollConvToBottom();
|
|
1101
|
+
break;
|
|
1102
|
+
}
|
|
872
1103
|
case "done": {
|
|
873
|
-
// Daemon may emit `done` twice (retry/race). Process only once per turn.
|
|
874
1104
|
if (doneHandled) break;
|
|
875
1105
|
doneHandled = true;
|
|
876
|
-
|
|
877
|
-
//
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
//
|
|
881
|
-
//
|
|
882
|
-
if (
|
|
883
|
-
|
|
884
|
-
if (!
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
// the already-rendered turn (see attachAudioToLastAgentTurn below).
|
|
893
|
-
const finalizedTurnId = streamingAgentEntry?.id;
|
|
894
|
-
finalizeStreamingAgent();
|
|
895
|
-
mode = "idle"; render();
|
|
896
|
-
// Fire-and-forget TTS request. If it returns audio, attach it to
|
|
897
|
-
// the turn we just rendered; if it errors / times out / never replies,
|
|
898
|
-
// no big deal — the user already has the text. Guard with a 6s soft
|
|
899
|
-
// timeout so a stuck request doesn't hold ttsTimer state.
|
|
900
|
-
const handled = window.apx?.requestTts?.(finalText);
|
|
901
|
-
if (handled) {
|
|
902
|
-
if (ttsTimer) clearTimeout(ttsTimer);
|
|
903
|
-
ttsTimer = setTimeout(() => { ttsTimer = null; }, 6000);
|
|
904
|
-
// Remember which turn the next tts-ready/failed belongs to.
|
|
905
|
-
pendingTtsTurnId = finalizedTurnId || null;
|
|
1106
|
+
turnDone = true;
|
|
1107
|
+
// Record the whole turn as one assistant entry for conversation context.
|
|
1108
|
+
const full = (msg.text || "").trim();
|
|
1109
|
+
if (full) history.push({ role: "assistant", content: full });
|
|
1110
|
+
// Safety net: if some segment's TTS never resolves, flush after 12s so
|
|
1111
|
+
// the capsule can't get stuck in "Pensando…".
|
|
1112
|
+
if (turnWatchdog) clearTimeout(turnWatchdog);
|
|
1113
|
+
turnWatchdog = setTimeout(() => {
|
|
1114
|
+
turnAudios.forEach((e) => { if (!e.ready) { e.ready = true; e.failed = true; e.played = true; } });
|
|
1115
|
+
pumpAudioQueue();
|
|
1116
|
+
}, 12000);
|
|
1117
|
+
// Play whatever audio is already ready; flip to idle if there's nothing
|
|
1118
|
+
// left to play (e.g. a turn that produced no audio).
|
|
1119
|
+
pumpAudioQueue();
|
|
1120
|
+
if (!queuePlaying && audioCursor >= turnAudios.length && mode !== "speaking") {
|
|
1121
|
+
mode = "idle"; render();
|
|
906
1122
|
}
|
|
907
1123
|
break;
|
|
908
1124
|
}
|
|
909
|
-
case "tts-ready":
|
|
910
|
-
if (
|
|
911
|
-
if (pendingTtsTurnId != null) {
|
|
912
|
-
attachAudioToTurn(pendingTtsTurnId, { url: msg.url, dur: msg.duration });
|
|
913
|
-
pendingTtsTurnId = null;
|
|
914
|
-
}
|
|
1125
|
+
case "tts-ready":
|
|
1126
|
+
if (msg.seg != null) attachAudioToTurn(msg.seg, { url: msg.url, dur: msg.duration });
|
|
915
1127
|
break;
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
if (
|
|
920
|
-
pendingTtsTurnId = null;
|
|
1128
|
+
case "tts-failed": {
|
|
1129
|
+
// No audio for this segment — skip it in the queue so playback advances.
|
|
1130
|
+
const m = (msg.seg != null) ? messages.find((x) => x.id === msg.seg) : null;
|
|
1131
|
+
if (m) queueMarkFailed(m);
|
|
921
1132
|
break;
|
|
922
|
-
|
|
923
|
-
|
|
1133
|
+
}
|
|
1134
|
+
case "error": {
|
|
1135
|
+
ensureConv();
|
|
1136
|
+
const id = nextId++;
|
|
1137
|
+
const m = { id, seq: 9999, turn: currentTurn, role: "agent", text: "Error: " + (msg.message || "Unknown error"), t: nowHHMM(), isError: true };
|
|
1138
|
+
messages.push(m);
|
|
1139
|
+
appendTurn(m, true);
|
|
1140
|
+
turnDone = true;
|
|
1141
|
+
if (mode !== "speaking") { mode = "idle"; render(); }
|
|
924
1142
|
break;
|
|
1143
|
+
}
|
|
925
1144
|
case "cancelled":
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
else finalizeStreamingAgent();
|
|
929
|
-
streamingAgentEntry = null;
|
|
930
|
-
}
|
|
1145
|
+
resetTurnAudio();
|
|
1146
|
+
turnDone = true;
|
|
931
1147
|
mode = "idle"; render();
|
|
932
1148
|
break;
|
|
933
1149
|
}
|
|
@@ -949,8 +1165,20 @@
|
|
|
949
1165
|
document.addEventListener("keydown", (event) => {
  if (event.key !== "Escape") return;
  event.preventDefault();

  // Escape cancels whatever is in flight (recording / transcribing /
  // thinking / speaking). If nothing is in flight, a half-typed draft is
  // cleared first; only an empty idle capsule closes the window.
  const inFlightModes = ["listening", "transcribing", "thinking", "speaking"];
  if (inFlightModes.includes(mode)) {
    cancel();
    return;
  }

  const draftInput = $capCenter.querySelector("input");
  const hasDraft = Boolean(draftInput?.value.trim());
  if (hasDraft) {
    draftInput.value = "";
    render();
    return;
  }

  closeWindow();
});
|
|
956
1184
|
|
|
@@ -975,6 +1203,13 @@
|
|
|
975
1203
|
setInterval(requestWindowResize, 250);
|
|
976
1204
|
}
|
|
977
1205
|
|
|
1206
|
+
// ── Keep STT warm ────────────────────────────────────────────────────────
// The whisper server idles out after ~10 min. While the desktop window is
// running we ping it every 4 min (and once now) so it stays loaded — the
// user's first utterance never pays the cold-load cost.
{
  const STT_WARMUP_INTERVAL_MS = 4 * 60 * 1000;
  // No-op when the preload bridge (or its warmupStt method) is absent.
  const pingStt = () => { window.apx?.warmupStt?.(); };

  pingStt();
  setInterval(pingStt, STT_WARMUP_INTERVAL_MS);
}
|
|
1212
|
+
|
|
978
1213
|
// ── Helpers ──────────────────────────────────────────────────────────────
|
|
979
1214
|
function nowHHMM() {
|
|
980
1215
|
const d = new Date();
|