@agentprojectcontext/apx 1.27.2 → 1.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/host/daemon/api/transcribe.js +12 -0
- package/src/host/daemon/plugins/desktop.js +34 -12
- package/src/host/daemon/transcription.js +29 -0
- package/src/host/daemon/whisper-server.py +11 -0
- package/src/interfaces/desktop/main.js +48 -5
- package/src/interfaces/desktop/preload.js +10 -4
- package/src/interfaces/desktop/renderer.js +373 -127
- package/src/interfaces/desktop/style.css +21 -10
package/package.json
CHANGED
|
@@ -6,6 +6,18 @@
|
|
|
6
6
|
//
|
|
7
7
|
// Shared by overlay, telegram voice messages, and any external caller.
|
|
8
8
|
export function register(app) {
|
|
9
|
+
// GET /transcribe/warmup — load the local whisper model (if needed) and reset
|
|
10
|
+
// its idle watchdog. Callers (e.g. the desktop window) ping this while open so
|
|
11
|
+
// the first real utterance doesn't pay the cold-load cost.
|
|
12
|
+
app.get("/transcribe/warmup", async (_req, res) => {
|
|
13
|
+
try {
|
|
14
|
+
const { warmupWhisper } = await import("../transcription.js");
|
|
15
|
+
res.json(await warmupWhisper());
|
|
16
|
+
} catch (e) {
|
|
17
|
+
res.status(500).json({ ok: false, error: e.message });
|
|
18
|
+
}
|
|
19
|
+
});
|
|
20
|
+
|
|
9
21
|
app.post("/transcribe/chunk", async (req, res) => {
|
|
10
22
|
const chunks = [];
|
|
11
23
|
req.on("data", (c) => chunks.push(c));
|
|
@@ -100,9 +100,27 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
100
100
|
await appendGlobalMessage({ channel: CHANNEL, direction: "in", type: "user", author: "user", body: text });
|
|
101
101
|
} catch {}
|
|
102
102
|
|
|
103
|
-
let fullResponse = "";
|
|
104
103
|
let toolsExecuted = [];
|
|
105
104
|
|
|
105
|
+
// Per-segment streaming: instead of merging the whole turn into one blob, we
|
|
106
|
+
// emit each assistant text piece as its own `segment` (an intro before a tool,
|
|
107
|
+
// then the post-tool answer, …). The renderer renders each as its own bubble
|
|
108
|
+
// and synthesizes its own audio, so a multi-step reply reads as separate spoken
|
|
109
|
+
// messages instead of one run-on bubble. `liveBuf` accumulates streamed tokens
|
|
110
|
+
// (streaming engines) so they can be flushed as a segment at each boundary;
|
|
111
|
+
// for non-streaming models like gemini the text arrives whole via events.
|
|
112
|
+
let segSeq = 0;
|
|
113
|
+
let lastSegText = "";
|
|
114
|
+
let liveBuf = "";
|
|
115
|
+
const emittedSegments = [];
|
|
116
|
+
const emitSegment = (raw) => {
|
|
117
|
+
const seg = (raw || "").trim();
|
|
118
|
+
if (!seg || seg === lastSegText) return;
|
|
119
|
+
lastSegText = seg;
|
|
120
|
+
emittedSegments.push(seg);
|
|
121
|
+
_send(ws, { type: "segment", seq: ++segSeq, text: seg });
|
|
122
|
+
};
|
|
123
|
+
|
|
106
124
|
try {
|
|
107
125
|
if (!isSuperAgentEnabled(config)) {
|
|
108
126
|
throw new Error("super-agent not enabled — set super_agent.enabled + super_agent.model in ~/.apx/config.json");
|
|
@@ -120,10 +138,7 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
120
138
|
previousMessages: history.slice(0, -1),
|
|
121
139
|
overrideModel: cfg.model || null,
|
|
122
140
|
signal: controller.signal,
|
|
123
|
-
onToken: (chunk) => {
|
|
124
|
-
fullResponse += chunk;
|
|
125
|
-
_send(ws, { type: "token", text: chunk });
|
|
126
|
-
},
|
|
141
|
+
onToken: (chunk) => { liveBuf += chunk; },
|
|
127
142
|
onEvent: async (event) => {
|
|
128
143
|
if (event.type === "tool_start") {
|
|
129
144
|
const t = event.trace;
|
|
@@ -131,17 +146,24 @@ async function _handleMessage({ ws, text, previousMessages }, { projects, config
|
|
|
131
146
|
_send(ws, { type: "tool_start", name: t.tool, args: t.args });
|
|
132
147
|
} else if (event.type === "tool_result") {
|
|
133
148
|
_send(ws, { type: "tool_done", name: event.trace.tool });
|
|
134
|
-
} else if (event.type === "assistant_text" && event.text
|
|
135
|
-
|
|
136
|
-
|
|
149
|
+
} else if (event.type === "assistant_text" && event.text) {
|
|
150
|
+
// A complete assistant text segment (e.g. the "I'll check…" intro
|
|
151
|
+
// emitted right before a tool runs). Ship it as its own message.
|
|
152
|
+
emitSegment(event.text);
|
|
153
|
+
liveBuf = "";
|
|
137
154
|
}
|
|
138
155
|
},
|
|
139
156
|
});
|
|
140
|
-
|
|
141
|
-
|
|
157
|
+
// The final (no-tool) iteration's answer appears ONLY in result.text (or, for
|
|
158
|
+
// streaming engines, in liveBuf) — it's never emitted as an event. Ship it as
|
|
159
|
+
// the closing segment (deduped against the last one).
|
|
160
|
+
emitSegment((result.text || "").trim() || liveBuf.trim());
|
|
161
|
+
|
|
162
|
+
const finalText = emittedSegments.join("\n\n");
|
|
163
|
+
log(`desktop: super-agent turn done in ${Date.now() - t0}ms segments=${segSeq} text_len=${finalText.length} tools=${toolsExecuted.length}`);
|
|
142
164
|
|
|
143
|
-
//
|
|
144
|
-
_send(ws, { type: "done", text: finalText });
|
|
165
|
+
// Turn end. `segments` lets the renderer know how many bubbles to expect.
|
|
166
|
+
_send(ws, { type: "done", segments: segSeq, text: finalText });
|
|
145
167
|
|
|
146
168
|
// Append assistant turn to history
|
|
147
169
|
if (ws && histories) {
|
|
@@ -481,6 +481,35 @@ export async function preloadWhisperServer(log = console.log) {
|
|
|
481
481
|
}
|
|
482
482
|
}
|
|
483
483
|
|
|
484
|
+
/**
 * Keep the local whisper server warm.
 *
 * Ensures the local server is running, then requests its `/warmup` endpoint,
 * which loads the model (lazy otherwise) and resets the server's idle timer so
 * a live session (e.g. the desktop window held open) doesn't pay the
 * cold-load cost on the next utterance. Safe to call repeatedly.
 *
 * Never throws: every failure is reported through the returned object.
 *
 * @returns {Promise<object>} One of:
 *   - `{ ok: true, provider: "openai", loaded: false }` when the configured
 *     provider is "openai" (the local server is not touched);
 *   - `{ ok: true, provider: "local", model, loaded }` after a warm-up
 *     attempt, plus `warmup_error` (string) when the `/warmup` request itself
 *     failed, timed out, or answered with a non-2xx status;
 *   - `{ ok: false, error }` when reading the config or ensuring the server
 *     threw.
 */
export async function warmupWhisper() {
  try {
    const cfg = await getConfig();
    if (cfg.provider === "openai") return { ok: true, provider: "openai", loaded: false };
    await ensureWhisperServer(cfg.local);

    let loaded = false;
    let warmupError = null;
    try {
      // The first call may block while a cold model loads, so the timeout is
      // generous; once the model is warm the request returns immediately.
      const r = await fetch(`http://127.0.0.1:${WHISPER_PORT}/warmup`, {
        signal: AbortSignal.timeout(40_000),
      });
      const j = await r.json().catch(() => ({}));
      loaded = !!j.loaded;
      if (!r.ok) warmupError = j.error || `warmup failed with HTTP ${r.status}`;
    } catch (e) {
      // Best-effort: the server is up (ensureWhisperServer succeeded), so the
      // call still reports ok — but say why the warm-up did not complete
      // instead of silently looking like "not loaded yet".
      warmupError = e?.message || String(e);
    }

    const result = { ok: true, provider: "local", model: _serverModel, loaded };
    if (warmupError) result.warmup_error = warmupError;
    return result;
  } catch (e) {
    return { ok: false, error: e.message };
  }
}
|
|
512
|
+
|
|
484
513
|
/**
|
|
485
514
|
* Stop the whisper server we own (no-op if we adopted an external one).
|
|
486
515
|
*/
|
|
@@ -94,6 +94,17 @@ class _Handler(BaseHTTPRequestHandler):
|
|
|
94
94
|
"model": _model_name or _Handler.model_name,
|
|
95
95
|
"loaded": _model is not None,
|
|
96
96
|
})
|
|
97
|
+
elif self.path == "/warmup":
|
|
98
|
+
# Eagerly load the model into RAM (no audio needed) and reset the
|
|
99
|
+
# idle timer, so the first real transcription isn't cold. Blocks
|
|
100
|
+
# until the model is loaded the first time; instant once warm.
|
|
101
|
+
_touch()
|
|
102
|
+
with _model_lock:
|
|
103
|
+
try:
|
|
104
|
+
_load_model_if_needed(_Handler.model_name, _Handler.device, _Handler.compute_type)
|
|
105
|
+
self._send_json(200, {"ok": True, "loaded": _model is not None, "model": _model_name})
|
|
106
|
+
except Exception as e:
|
|
107
|
+
self._send_json(500, {"ok": False, "error": f"model load failed: {e}"})
|
|
97
108
|
else:
|
|
98
109
|
self._send_json(404, {"ok": False, "error": "not found"})
|
|
99
110
|
|
|
@@ -46,6 +46,20 @@ function getShortcut() {
|
|
|
46
46
|
return cfg?.desktop?.shortcut || cfg?.overlay?.shortcut || DEFAULT_SHORTCUT;
|
|
47
47
|
}
|
|
48
48
|
|
|
49
|
+
// Voice-capture timing for the listening capsule. Overridable in config.json:
|
|
50
|
+
// "desktop": { "silence_ms": 1200, "voice_rms": 0.025 }
|
|
51
|
+
// silence_ms — quiet after speech before auto-send. voice_rms — RMS above
|
|
52
|
+
// which audio counts as voice (lower = more sensitive).
|
|
53
|
+
/**
 * Read the voice-capture timing for the listening capsule from the APX config.
 *
 * Each key is resolved independently: `desktop.<key>` wins, then
 * `overlay.<key>`, then the built-in default — the same lookup order
 * getShortcut() uses. Values that are not finite numbers are ignored.
 *
 * @returns {{ silence_ms: number, voice_rms: number }}
 *   `silence_ms` is clamped to at least 400; `voice_rms` to at least 0.
 */
function getVoiceTiming() {
  const cfg = readApxConfig();
  const sections = [cfg?.desktop, cfg?.overlay];

  // First finite number found for `key` across the sections, else `fallback`.
  const pick = (key, fallback) => {
    for (const section of sections) {
      const value = section?.[key];
      if (typeof value === "number" && Number.isFinite(value)) return value;
    }
    return fallback;
  };

  return {
    silence_ms: Math.max(400, pick("silence_ms", 1200)),
    voice_rms: Math.max(0, pick("voice_rms", 0.025)),
  };
}
|
|
62
|
+
|
|
49
63
|
function readToken() {
|
|
50
64
|
try { return fs.readFileSync(TOKEN_PATH, "utf8").trim(); } catch { return ""; }
|
|
51
65
|
}
|
|
@@ -397,6 +411,7 @@ ipcMain.handle("get-shortcut", () => getShortcut());
|
|
|
397
411
|
ipcMain.handle("get-theme", () => getTheme());
|
|
398
412
|
ipcMain.handle("get-position", () => getPosition());
|
|
399
413
|
ipcMain.handle("get-agent-name", () => getAgentName());
|
|
414
|
+
ipcMain.handle("get-voice-timing", () => getVoiceTiming());
|
|
400
415
|
|
|
401
416
|
// Renderer asks main to grow/shrink the window to fit its content.
|
|
402
417
|
// Clamped to [WIN_H_MIN, getMaxWindowHeight()]; same anchor (top edge stays put).
|
|
@@ -411,27 +426,29 @@ ipcMain.on("resize-window", (_e, { height }) => {
|
|
|
411
426
|
// Renderer asks for TTS playback of the agent reply. We synthesize via the
|
|
412
427
|
// daemon and pipe the audio path back as a daemon-event the renderer already
|
|
413
428
|
// knows how to consume (tts-ready { url, duration } / tts-failed).
|
|
414
|
-
ipcMain.handle("request-tts", async (_e, { text }) => {
|
|
429
|
+
ipcMain.handle("request-tts", async (_e, { text, seg }) => {
|
|
415
430
|
if (!text || !text.trim()) {
|
|
416
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed" });
|
|
431
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg });
|
|
417
432
|
return;
|
|
418
433
|
}
|
|
419
434
|
try {
|
|
420
435
|
const result = await daemonTtsSay(text);
|
|
421
436
|
if (result?.ok && result.audio_path) {
|
|
422
437
|
// Expose the local file via file:// — preload's contextIsolation lets
|
|
423
|
-
// the renderer's <audio> tag fetch it directly.
|
|
438
|
+
// the renderer's <audio> tag fetch it directly. `seg` ties this audio to
|
|
439
|
+
// the bubble that asked for it.
|
|
424
440
|
const url = "file://" + result.audio_path;
|
|
425
441
|
mainWindow?.webContents.send("daemon-event", {
|
|
426
442
|
type: "tts-ready",
|
|
443
|
+
seg,
|
|
427
444
|
url,
|
|
428
445
|
duration: result.duration_s || 0,
|
|
429
446
|
});
|
|
430
447
|
} else {
|
|
431
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: result?.error || "no audio" });
|
|
448
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: result?.error || "no audio" });
|
|
432
449
|
}
|
|
433
450
|
} catch (e) {
|
|
434
|
-
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", error: e.message });
|
|
451
|
+
mainWindow?.webContents.send("daemon-event", { type: "tts-failed", seg, error: e.message });
|
|
435
452
|
}
|
|
436
453
|
});
|
|
437
454
|
|
|
@@ -462,6 +479,32 @@ ipcMain.handle("check-whisper-ready", () => {
|
|
|
462
479
|
});
|
|
463
480
|
});
|
|
464
481
|
|
|
482
|
+
// Renderer asks to keep STT warm. Routed through the daemon's
// GET /transcribe/warmup (not whisper directly) so it both loads the model if
// it idled out and resets the idle watchdog.
//
// Always resolves — never rejects — with the daemon's parsed JSON reply, or
// `{ ok: false }` on a network error, timeout, or unparseable body.
ipcMain.handle("warmup-stt", async () => {
  return new Promise((resolve) => {
    const token = readToken();
    const options = {
      hostname: DAEMON_HOST,
      port: DAEMON_PORT,
      path: "/transcribe/warmup",
      method: "GET",
      headers: { ...(token ? { "Authorization": `Bearer ${token}` } : {}) },
    };
    const req = http.request(options, (res) => {
      let data = "";
      // Decode as UTF-8 at the stream level so a multi-byte character split
      // across two chunks is not corrupted by per-chunk string coercion.
      res.setEncoding("utf8");
      res.on("data", (c) => { data += c; });
      // A response stream that breaks mid-body must not surface as an
      // unhandled 'error' event in the main process.
      res.on("error", () => resolve({ ok: false }));
      res.on("end", () => { try { resolve(JSON.parse(data)); } catch { resolve({ ok: false }); } });
    });
    req.on("error", () => resolve({ ok: false }));
    // Cold model load can take ~30s; give it room. (Renderer fires this
    // fire-and-forget, so a long warm-up never blocks the UI.) Resolving more
    // than once is harmless: only the first resolve() takes effect.
    req.setTimeout(45000, () => { req.destroy(); resolve({ ok: false }); });
    req.end();
  });
});
|
|
507
|
+
|
|
465
508
|
// Renderer requests recording toggle (ESC cancels, shortcut toggles)
|
|
466
509
|
ipcMain.handle("toggle-recording", async () => {
|
|
467
510
|
if (isRecording) stopRecording(); else startRecording();
|
|
@@ -18,14 +18,19 @@ contextBridge.exposeInMainWorld("apx", {
|
|
|
18
18
|
// Check if the whisper model is loaded (false = still loading)
|
|
19
19
|
checkWhisperReady: () => ipcRenderer.invoke("check-whisper-ready"),
|
|
20
20
|
|
|
21
|
+
// Keep STT warm (loads the model if idle + resets the idle timer). Called
|
|
22
|
+
// while the window is open / on mic-open so the first decode isn't cold.
|
|
23
|
+
warmupStt: () => ipcRenderer.invoke("warmup-stt").catch(() => ({ ok: false })),
|
|
24
|
+
|
|
21
25
|
// Send final text to daemon
|
|
22
26
|
sendMessage: (text, previousMessages) =>
|
|
23
27
|
ipcRenderer.invoke("send-message", { text, previousMessages }),
|
|
24
28
|
|
|
25
|
-
//
|
|
26
|
-
//
|
|
27
|
-
|
|
28
|
-
|
|
29
|
+
// Ask main to synthesize TTS for one segment. `seg` correlates the resulting
|
|
30
|
+
// tts-ready/tts-failed event back to the bubble that requested it (each
|
|
31
|
+
// assistant message has its own audio). Returns true optimistically.
|
|
32
|
+
requestTts: (text, seg) => {
|
|
33
|
+
ipcRenderer.invoke("request-tts", { text, seg }).catch(() => {});
|
|
29
34
|
return true; // optimistic; renderer waits for the event either way
|
|
30
35
|
},
|
|
31
36
|
|
|
@@ -42,6 +47,7 @@ contextBridge.exposeInMainWorld("apx", {
|
|
|
42
47
|
getTheme: () => ipcRenderer.invoke("get-theme"),
|
|
43
48
|
getPosition: () => ipcRenderer.invoke("get-position"),
|
|
44
49
|
getAgentName: () => ipcRenderer.invoke("get-agent-name"),
|
|
50
|
+
getVoiceTiming: () => ipcRenderer.invoke("get-voice-timing"),
|
|
45
51
|
|
|
46
52
|
// Renderer asks main to resize the BrowserWindow to the rendered height
|
|
47
53
|
resize: (height) => ipcRenderer.send("resize-window", { height }),
|
|
@@ -31,15 +31,49 @@
|
|
|
31
31
|
let recorderFormat = "webm";
|
|
32
32
|
let liveBusy = false;
|
|
33
33
|
|
|
34
|
+
// Mic is async to open (getUserMedia + recorder warm-up). Until it's actually
|
|
35
|
+
// capturing we show a "Cargando…" state instead of the wave, so the user
|
|
36
|
+
// doesn't talk into the dead gap before the recorder starts.
|
|
37
|
+
let micReady = false;
|
|
38
|
+
|
|
39
|
+
// Silence auto-send: once speech has been heard, SILENCE_MS of quiet
|
|
40
|
+
// auto-commits the recording. RMS (time-domain) is the voice/silence gate.
|
|
41
|
+
// Both are overridable from config.json (desktop.silence_ms / voice_rms).
|
|
42
|
+
let speechSeen = false;
|
|
43
|
+
let lastVoiceTs = 0;
|
|
44
|
+
let SILENCE_MS = 1200; // quiet after speech → send on its own
|
|
45
|
+
let VOICE_RMS = 0.025; // RMS above this counts as voice (0 = silence)
|
|
46
|
+
const PAUSE_PREVIEW_MS = 600; // a short pause kicks ONE decode (reused on send)
|
|
47
|
+
|
|
48
|
+
// When a pause triggers a preview decode, that decode already covers all the
|
|
49
|
+
// speech (the tail is just trailing silence), so the auto-send reuses it
|
|
50
|
+
// instead of paying a second full decode. These coordinate that handoff.
|
|
51
|
+
let pausePreviewed = false; // a preview decode fired for the current pause
|
|
52
|
+
let reuseLiveOnStop = false; // commit should reuse pendingUserText, not re-decode
|
|
53
|
+
let livePromise = null; // in-flight preview decode (awaited on reuse)
|
|
54
|
+
|
|
34
55
|
// Web Audio analyser — drives the live capsule wave from real mic amplitude
|
|
35
56
|
let audioCtx = null;
|
|
36
57
|
let analyser = null;
|
|
37
58
|
let freqData = null;
|
|
59
|
+
let timeData = null;
|
|
38
60
|
let waveRaf = null;
|
|
39
61
|
|
|
40
|
-
let streamingAgentEntry = null; //
|
|
41
|
-
let toolPillsByName = {}; // active tool pills
|
|
42
|
-
let ttsAudio = null; // <audio> playing
|
|
62
|
+
let streamingAgentEntry = null; // legacy single-bubble streaming (kept dormant)
|
|
63
|
+
let toolPillsByName = {}; // active tool pills, by tool name, for the live turn
|
|
64
|
+
let ttsAudio = null; // <audio> currently playing
|
|
65
|
+
|
|
66
|
+
// ── Per-segment turn rendering ──────────────────────────────────────────
|
|
67
|
+
// A turn is now N agent message bubbles (intro, post-tool answer, …), each
|
|
68
|
+
// with its own audio. `currentTurn` tags every bubble of a turn so regen can
|
|
69
|
+
// drop the whole turn. The audio queue plays segment audios in seq order
|
|
70
|
+
// (gapless auto-play), waiting at the cursor for each segment's TTS to land.
|
|
71
|
+
let currentTurn = 0;
|
|
72
|
+
let turnAudios = []; // [{ m, ready, failed, played }] ordered by seq
|
|
73
|
+
let audioCursor = 0; // index of the next segment to play
|
|
74
|
+
let queuePlaying = false; // a segment audio is currently playing
|
|
75
|
+
let turnDone = false; // `done` received for the active turn
|
|
76
|
+
let turnWatchdog = null; // flushes the queue if a segment's TTS hangs
|
|
43
77
|
|
|
44
78
|
let history = []; // [{role:'user'|'assistant', content}] sent to daemon for context
|
|
45
79
|
let theme = "light";
|
|
@@ -119,10 +153,15 @@
|
|
|
119
153
|
window.apx?.getPosition?.() ?? "right",
|
|
120
154
|
window.apx?.getShortcut?.() ?? "CommandOrControl+G",
|
|
121
155
|
window.apx?.getAgentName?.() ?? "Superagente",
|
|
122
|
-
|
|
156
|
+
window.apx?.getVoiceTiming?.() ?? null,
|
|
157
|
+
]).then(([th, pos, shortcut, name, timing]) => {
|
|
123
158
|
theme = th || "light";
|
|
124
159
|
position = pos || "right";
|
|
125
160
|
agentName = (name && String(name).trim()) || "Superagente";
|
|
161
|
+
if (timing) {
|
|
162
|
+
if (typeof timing.silence_ms === "number") SILENCE_MS = timing.silence_ms;
|
|
163
|
+
if (typeof timing.voice_rms === "number") VOICE_RMS = timing.voice_rms;
|
|
164
|
+
}
|
|
126
165
|
document.documentElement.setAttribute("data-theme", theme);
|
|
127
166
|
setPosition(position);
|
|
128
167
|
initialCaption(shortcut);
|
|
@@ -217,6 +256,13 @@
|
|
|
217
256
|
}
|
|
218
257
|
}
|
|
219
258
|
// else: input already there → leave it alone (preserves focus + caret)
|
|
259
|
+
} else if (mode === "listening" && !micReady) {
|
|
260
|
+
// Mic still opening (getUserMedia + recorder warm-up). Show a loading
|
|
261
|
+
// status so the user waits for capture instead of talking into the gap.
|
|
262
|
+
if ($capCenter.dataset.mode !== "loading") {
|
|
263
|
+
$capCenter.dataset.mode = "loading";
|
|
264
|
+
$capCenter.innerHTML = `<span class="status"><span class="dots"><i></i><i></i><i></i></span><span class="shimmer">Cargando…</span></span>`;
|
|
265
|
+
}
|
|
220
266
|
} else if (mode === "listening") {
|
|
221
267
|
// Only rebuild the wave if it's not already there (avoids restarting
|
|
222
268
|
// CSS animations / Web Audio binding every render).
|
|
@@ -246,9 +292,10 @@
|
|
|
246
292
|
}
|
|
247
293
|
}
|
|
248
294
|
}
|
|
249
|
-
// Clear data-mode when we're back to idle
|
|
250
|
-
// re-renders correctly.
|
|
251
|
-
|
|
295
|
+
// Clear data-mode when we're back to idle, or once the live wave is up, so
|
|
296
|
+
// a future busy mode re-renders correctly. While the mic is still warming
|
|
297
|
+
// up we keep the "loading" marker so "Cargando…" isn't rebuilt every frame.
|
|
298
|
+
if (mode === "idle" || (mode === "listening" && micReady)) $capCenter.dataset.mode = "";
|
|
252
299
|
|
|
253
300
|
// actions
|
|
254
301
|
$capActions.innerHTML = "";
|
|
@@ -273,7 +320,8 @@
|
|
|
273
320
|
}
|
|
274
321
|
} else if (mode === "listening") {
|
|
275
322
|
addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
|
|
276
|
-
|
|
323
|
+
// No "Enviar" until the recorder is live — nothing to send mid-warm-up.
|
|
324
|
+
if (micReady) addBtn("", "Enviar", ICON.send(), () => stopListening(/* commit */ true));
|
|
277
325
|
} else if (mode === "transcribing") {
|
|
278
326
|
addBtn("ghost", "Cancelar", ICON.x(), () => cancel());
|
|
279
327
|
} else if (mode === "thinking") {
|
|
@@ -321,7 +369,6 @@
|
|
|
321
369
|
// Re-render all existing turns
|
|
322
370
|
messages.forEach((m, i) => appendTurn(m, i === messages.length - 1));
|
|
323
371
|
if (mode === "transcribing") renderPendingUserPartial();
|
|
324
|
-
if (mode === "thinking" || mode === "speaking") ensureStreamingAgentBubble();
|
|
325
372
|
}
|
|
326
373
|
}
|
|
327
374
|
|
|
@@ -342,17 +389,28 @@
|
|
|
342
389
|
<div class="bubble-user">${escapeHtml(m.text)}${viaIcon}</div>
|
|
343
390
|
`;
|
|
344
391
|
} else {
|
|
345
|
-
|
|
392
|
+
// Consecutive agent messages (intro + post-tool answer …) read as one
|
|
393
|
+
// continued reply: only the FIRST shows the "Roby" header — the rest skip
|
|
394
|
+
// it so a tool turn isn't a stack of repeated "Roby" labels. A new header
|
|
395
|
+
// only appears when something (a user message) breaks the run.
|
|
396
|
+
const idx = messages.indexOf(m);
|
|
397
|
+
const prevMsg = idx > 0 ? messages[idx - 1] : null;
|
|
398
|
+
const agentCont = !!(prevMsg && prevMsg.role === "agent");
|
|
399
|
+
if (agentCont) t.classList.add("cont");
|
|
400
|
+
const header = agentCont ? "" : `
|
|
346
401
|
<div class="role agent">
|
|
347
402
|
<span class="ava sa"><img src="assets/superagent.png" alt=""/></span>
|
|
348
403
|
<span class="who">${escapeHtml(agentName)}</span>
|
|
349
404
|
<span class="time">${m.t || ""}</span>
|
|
350
|
-
</div
|
|
351
|
-
|
|
352
|
-
|
|
405
|
+
</div>`;
|
|
406
|
+
// Copy is an inline icon at the end of the text, hover-only, so it never
|
|
407
|
+
// reserves an empty row. Regenerate lives in turn-actions and CSS shows it
|
|
408
|
+
// only on the last turn.
|
|
409
|
+
t.innerHTML = `
|
|
410
|
+
${header}
|
|
411
|
+
<div class="msg-agent">${formatWordsHtml(m.text)}<button class="btn-copy" aria-label="Copiar" title="Copiar">${ICON.copy()}</button></div>
|
|
353
412
|
<div class="turn-actions">
|
|
354
413
|
<button class="chip btn-regen">${ICON.refresh()} Regenerar</button>
|
|
355
|
-
<button class="chip btn-copy">${ICON.copy()} Copiar</button>
|
|
356
414
|
</div>
|
|
357
415
|
`;
|
|
358
416
|
if (m.audio && m.dur) {
|
|
@@ -362,13 +420,13 @@
|
|
|
362
420
|
actions.insertAdjacentHTML("beforebegin", scrubberHtml);
|
|
363
421
|
wireScrubber(t, m);
|
|
364
422
|
}
|
|
365
|
-
// copy
|
|
423
|
+
// copy (inline icon → swaps to a check briefly)
|
|
366
424
|
t.querySelector(".btn-copy")?.addEventListener("click", (e) => {
|
|
367
425
|
navigator.clipboard?.writeText(m.text).catch(() => {});
|
|
368
426
|
const btn = e.currentTarget;
|
|
369
427
|
btn.classList.add("done");
|
|
370
|
-
btn.innerHTML =
|
|
371
|
-
setTimeout(() => { btn.classList.remove("done"); btn.innerHTML =
|
|
428
|
+
btn.innerHTML = ICON.check();
|
|
429
|
+
setTimeout(() => { btn.classList.remove("done"); btn.innerHTML = ICON.copy(); }, 1400);
|
|
372
430
|
});
|
|
373
431
|
// regen: only the LAST agent turn can be regenerated. Past turns
|
|
374
432
|
// can't because we'd have to re-issue the user prompt that came right
|
|
@@ -388,7 +446,10 @@
|
|
|
388
446
|
if (history.length && history[history.length - 1].role === "assistant") {
|
|
389
447
|
history.pop();
|
|
390
448
|
}
|
|
391
|
-
|
|
449
|
+
// A turn can be several agent bubbles (intro + post-tool answer…); drop
|
|
450
|
+
// them all so regen replaces the whole turn, not just the last segment.
|
|
451
|
+
const turnId = m.turn;
|
|
452
|
+
messages = messages.filter((x) => !(x.role === "agent" && turnId != null && x.turn === turnId) && x.id !== m.id);
|
|
392
453
|
rebuildConvFromState();
|
|
393
454
|
startAgentTurn();
|
|
394
455
|
sendToDaemon(lastUser.text);
|
|
@@ -499,12 +560,14 @@
|
|
|
499
560
|
}
|
|
500
561
|
|
|
501
562
|
function addToolPill(name) {
|
|
502
|
-
|
|
503
|
-
if (toolPillsByName[name]) return;
|
|
563
|
+
ensureConv();
|
|
564
|
+
if (!$convScroll || toolPillsByName[name]) return;
|
|
504
565
|
const pill = document.createElement("div");
|
|
505
566
|
pill.className = "tool-pill";
|
|
506
567
|
pill.innerHTML = `<div class="spinner"></div><span>${escapeHtml(name)}</span>`;
|
|
507
|
-
|
|
568
|
+
// Append at the end of the conversation flow — pills sit between the
|
|
569
|
+
// segment bubbles in the order tools actually run.
|
|
570
|
+
$convScroll.appendChild(pill);
|
|
508
571
|
toolPillsByName[name] = pill;
|
|
509
572
|
scrollConvToBottom();
|
|
510
573
|
}
|
|
@@ -541,53 +604,46 @@
|
|
|
541
604
|
const dur = m.dur || 1;
|
|
542
605
|
const fmt = (s) => `0:${String(Math.round(s)).padStart(2, "0")}`;
|
|
543
606
|
const audio = new Audio(m.audio);
|
|
607
|
+
m._audioEl = audio; // the audio queue drives sequential playback
|
|
544
608
|
let raf = null;
|
|
545
|
-
let progress = 0;
|
|
546
609
|
|
|
547
610
|
const setProgress = (p) => {
|
|
548
|
-
|
|
549
|
-
const cur = Math.floor(
|
|
611
|
+
p = Math.max(0, Math.min(1, p));
|
|
612
|
+
const cur = Math.floor(p * N);
|
|
550
613
|
bars.forEach((b, i) => {
|
|
551
614
|
b.classList.toggle("on", i <= cur);
|
|
552
615
|
b.classList.toggle("cur", i === cur && !audio.paused);
|
|
553
616
|
});
|
|
554
|
-
$dur.textContent =
|
|
617
|
+
$dur.textContent = p > 0 || !audio.paused ? fmt(p * dur) : fmt(dur);
|
|
555
618
|
};
|
|
556
|
-
|
|
557
619
|
const tick = () => {
|
|
558
620
|
if (audio.duration > 0) setProgress(audio.currentTime / audio.duration);
|
|
559
621
|
raf = requestAnimationFrame(tick);
|
|
560
622
|
};
|
|
561
|
-
audio.addEventListener("play",
|
|
562
|
-
audio.addEventListener("pause",
|
|
563
|
-
audio.addEventListener("ended",
|
|
623
|
+
audio.addEventListener("play", () => { $play.innerHTML = ICON.pause(); raf = requestAnimationFrame(tick); if (mode !== "speaking") { mode = "speaking"; render(); } });
|
|
624
|
+
audio.addEventListener("pause", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); });
|
|
625
|
+
audio.addEventListener("ended", () => { $play.innerHTML = ICON.play(); if (raf) cancelAnimationFrame(raf); setProgress(1); onSegmentEnded(m); });
|
|
626
|
+
// 404 / decode error / autoplay block: don't hang — advance the queue.
|
|
627
|
+
audio.addEventListener("error", () => onSegmentEnded(m));
|
|
564
628
|
|
|
565
|
-
$play.addEventListener("click", () =>
|
|
629
|
+
$play.addEventListener("click", () => {
|
|
630
|
+
if (audio.paused) {
|
|
631
|
+
// Manual play takes control — stop the auto-sequence so we don't fight it.
|
|
632
|
+
queuePlaying = false;
|
|
633
|
+
try { if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause(); } catch {}
|
|
634
|
+
ttsAudio = audio;
|
|
635
|
+
audio.play().catch(() => { if (mode === "speaking") { mode = "idle"; render(); } });
|
|
636
|
+
} else {
|
|
637
|
+
audio.pause();
|
|
638
|
+
if (mode === "speaking") { mode = "idle"; render(); }
|
|
639
|
+
}
|
|
640
|
+
});
|
|
566
641
|
$bar.addEventListener("click", (e) => {
|
|
567
642
|
const r = $bar.getBoundingClientRect();
|
|
568
643
|
const p = Math.max(0, Math.min(1, (e.clientX - r.left) / r.width));
|
|
569
644
|
if (audio.duration > 0) audio.currentTime = p * audio.duration;
|
|
570
645
|
setProgress(p);
|
|
571
646
|
});
|
|
572
|
-
|
|
573
|
-
// If the audio errors out (404, decode error, autoplay block, etc) make
|
|
574
|
-
// sure the capsule doesn't stay stuck in "está hablando…".
|
|
575
|
-
audio.addEventListener("error", () => {
|
|
576
|
-
if (mode === "speaking") { mode = "idle"; render(); }
|
|
577
|
-
});
|
|
578
|
-
|
|
579
|
-
// autoplay if it's the fresh reply
|
|
580
|
-
if (m.fresh) {
|
|
581
|
-
m.fresh = false;
|
|
582
|
-
ttsAudio?.pause?.();
|
|
583
|
-
ttsAudio = audio;
|
|
584
|
-
audio.play().catch(() => {
|
|
585
|
-
// Autoplay block (rare in Electron with user-gesture but possible
|
|
586
|
-
// when the window has never been focused). Bail out so the capsule
|
|
587
|
-
// returns to idle and the user can still tap "play" on the scrubber.
|
|
588
|
-
if (mode === "speaking" || mode === "thinking") { mode = "idle"; render(); }
|
|
589
|
-
});
|
|
590
|
-
}
|
|
591
647
|
}
|
|
592
648
|
|
|
593
649
|
// Post-finalize hook: add a scrubber to an already-rendered agent turn
|
|
@@ -597,15 +653,18 @@
|
|
|
597
653
|
if (!m) return;
|
|
598
654
|
m.audio = url;
|
|
599
655
|
m.dur = dur || 0;
|
|
600
|
-
m.fresh = true; // autoplay the freshly-arrived reply
|
|
601
656
|
const turnEl = $convScroll?.querySelector(`[data-id="${turnId}"]`);
|
|
602
|
-
if (!turnEl)
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
657
|
+
if (turnEl && !turnEl.querySelector(".audio")) {
|
|
658
|
+
// Insert the scrubber HTML just before turn-actions (matches appendTurn).
|
|
659
|
+
const actions = turnEl.querySelector(".turn-actions");
|
|
660
|
+
const html = buildScrubberHtml(m);
|
|
661
|
+
if (actions) actions.insertAdjacentHTML("beforebegin", html);
|
|
662
|
+
else turnEl.insertAdjacentHTML("beforeend", html);
|
|
663
|
+
wireScrubber(turnEl, m); // sets m._audioEl
|
|
664
|
+
}
|
|
665
|
+
// Audio is ready → let the sequential queue play it when it's this
|
|
666
|
+
// segment's turn (gapless auto-play across the turn's bubbles).
|
|
667
|
+
queueMarkReady(m);
|
|
609
668
|
scrollConvToBottom();
|
|
610
669
|
}
|
|
611
670
|
|
|
@@ -620,10 +679,94 @@
|
|
|
620
679
|
return out;
|
|
621
680
|
}
|
|
622
681
|
|
|
682
|
+
// ── Per-turn setup + sequential audio queue ──────────────────────────────
|
|
683
|
+
// Each turn renders N agent bubbles (segments), each with its own audio. We
|
|
684
|
+
// play those audios in `seq` order, gaplessly: the cursor waits at a segment
|
|
685
|
+
// until its TTS lands, plays it, then advances. So Roby "speaks" its messages
|
|
686
|
+
// one after another even though they synthesize at different speeds.
|
|
687
|
+
/**
 * Start a new agent turn: bump the turn counter, clear the per-turn audio
 * queue, and reset the done/TTS bookkeeping left over from the previous turn.
 */
function beginAgentTurn() {
  currentTurn += 1;
  resetTurnAudio();
  doneHandled = false;
  pendingTtsTurnId = null;

  // Cancel a TTS timer still pending from the previous turn, if any.
  if (!ttsTimer) return;
  clearTimeout(ttsTimer);
  ttsTimer = null;
}
|
|
694
|
+
/**
 * Stop whatever audio is playing and return the sequential audio queue to its
 * empty state (no segments, cursor at 0, nothing playing, turn not done).
 */
function resetTurnAudio() {
  // pause() on a half-torn-down element may throw; that is fine to ignore.
  try {
    ttsAudio?.pause?.();
  } catch {}
  ttsAudio = null;

  turnAudios = [];
  audioCursor = 0;
  queuePlaying = false;
  turnDone = false;

  if (turnWatchdog) {
    clearTimeout(turnWatchdog);
    turnWatchdog = null;
  }
}
|
|
703
|
+
/**
 * Add message `m` to the turn's audio queue (once) and keep the queue ordered
 * by `m.seq` (a missing seq sorts as 0).
 */
function queueRegisterSegment(m) {
  const alreadyQueued = turnAudios.some((entry) => entry.m === m);
  if (alreadyQueued) return;

  turnAudios.push({ m, ready: false, failed: false, played: false });
  turnAudios.sort((left, right) => (left.m.seq || 0) - (right.m.seq || 0));
}
|
|
709
|
+
/**
 * Flag message `m`'s queue entry (if it has one) as ready to play, then give
 * the queue a chance to advance.
 */
function queueMarkReady(m) {
  for (const entry of turnAudios) {
    if (entry.m === m) {
      entry.ready = true;
      break;
    }
  }
  pumpAudioQueue();
}
|
|
714
|
+
/**
 * Flag message `m`'s queue entry (if it has one) as failed. The entry is also
 * marked ready and played so the queue steps over it instead of waiting.
 */
function queueMarkFailed(m) {
  const entry = turnAudios.find((candidate) => candidate.m === m);
  if (entry) {
    entry.ready = true;
    entry.failed = true;
    entry.played = true;
  }
  pumpAudioQueue();
}
|
|
719
|
+
/**
 * Drives the sequential audio queue for the current turn.
 *
 * Starting at `audioCursor`, entries that are already played, failed or
 * have no audio element are skipped. The first playable entry is started
 * and the function returns; if the entry at the cursor is not ready yet the
 * function returns and waits for a later pump. When the queue is drained and
 * the turn is done, the watchdog is disarmed and a "speaking"/"thinking"
 * mode falls back to "idle".
 *
 * No-op while a queued audio is already playing.
 */
function pumpAudioQueue() {
  if (queuePlaying) return;

  while (audioCursor < turnAudios.length) {
    const entry = turnAudios[audioCursor];
    if (!entry.ready) return; // wait for this segment's TTS
    if (entry.played || entry.failed || !entry.m._audioEl) {
      audioCursor++;
      continue;
    }

    const audio = entry.m._audioEl;
    queuePlaying = true;
    try {
      // Silence a different, still-running audio before taking over.
      if (ttsAudio && ttsAudio !== audio && !ttsAudio.ended) ttsAudio.pause();
    } catch {
      // Best effort only.
    }
    ttsAudio = audio;

    audio.play().catch(() => { // autoplay blocked / decode error
      // A pause() issued while play() is still pending also rejects this
      // promise. If the queue was stopped or reset in the meantime (or
      // another audio took over), this rejection is stale: advancing the
      // cursor here would skip a segment of the new turn or restart
      // playback the user just stopped.
      if (!queuePlaying || ttsAudio !== audio) return;
      queuePlaying = false;
      entry.played = true;
      audioCursor++;
      pumpAudioQueue();
    });
    return;
  }

  // Drained. Once the turn is done and nothing's left, return to idle.
  if (turnDone) {
    if (turnWatchdog) {
      clearTimeout(turnWatchdog);
      turnWatchdog = null;
    }
    if (mode === "speaking" || mode === "thinking") {
      mode = "idle";
      render();
    }
  }
}
|
|
743
|
+
/**
 * Handler for a segment audio finishing (its `ended` or `error`).
 *
 * A queue entry that was already marked played is ignored. Otherwise the
 * entry (if any) is marked played and, when the finished audio is the one
 * the queue is currently playing, the cursor advances and the queue is
 * pumped. In every other case a "speaking" mode drops back to "idle".
 *
 * @param {object} m - Segment message whose audio element finished.
 */
function onSegmentEnded(m) {
  const entry = turnAudios.find((candidate) => candidate.m === m);
  if (entry) {
    if (entry.played) return;
    entry.played = true;
  }

  const finishedQueueAudio = queuePlaying && ttsAudio === m._audioEl;
  if (finishedQueueAudio) {
    queuePlaying = false;
    audioCursor++;
    pumpAudioQueue();
    return;
  }

  if (mode === "speaking") {
    mode = "idle";
    render();
  }
}
|
|
755
|
+
|
|
623
756
|
// ── Recording flow ───────────────────────────────────────────────────────
|
|
624
757
|
function startListening() {
|
|
625
758
|
if (mode !== "idle") return;
|
|
626
759
|
isCancelled = false;
|
|
760
|
+
micReady = false; // show "Cargando…" until the recorder is actually live
|
|
761
|
+
speechSeen = false;
|
|
762
|
+
lastVoiceTs = 0;
|
|
763
|
+
pausePreviewed = false;
|
|
764
|
+
reuseLiveOnStop = false;
|
|
765
|
+
livePromise = null;
|
|
766
|
+
pendingUserText = "";
|
|
767
|
+
// Warm the whisper model now (overlaps the mic warm-up), so the decode at
|
|
768
|
+
// the end of this utterance doesn't pay a cold start.
|
|
769
|
+
window.apx?.warmupStt?.();
|
|
627
770
|
mode = "listening";
|
|
628
771
|
render();
|
|
629
772
|
startMic();
|
|
@@ -649,6 +792,7 @@
|
|
|
649
792
|
if (mode === "listening") { stopMic(); }
|
|
650
793
|
if (mode === "thinking" || mode === "speaking") { window.apx?.cancel?.(); }
|
|
651
794
|
removePendingUserPartial();
|
|
795
|
+
resetTurnAudio(); // stop any playing/queued segment audio
|
|
652
796
|
if (streamingAgentEntry) {
|
|
653
797
|
streamingAgentEntry.el.remove();
|
|
654
798
|
streamingAgentEntry = null;
|
|
@@ -658,6 +802,8 @@
|
|
|
658
802
|
}
|
|
659
803
|
|
|
660
804
|
/**
 * Stops speech playback: halts the auto-sequence and the current segment.
 *
 * Besides lowering `queuePlaying` and pausing the current audio, every
 * segment already registered in this turn's queue is marked as played.
 * Without that, the next queue pump (triggered by a later TTS result or by
 * the turn watchdog) would find the paused segment still "ready and
 * unplayed" and start it again on its own.
 *
 * Segments registered after this call are not affected.
 */
function stopSpeaking() {
  queuePlaying = false;
  for (const entry of turnAudios ?? []) {
    entry.played = true;
  }
  try {
    ttsAudio?.pause?.();
  } catch {
    // Pausing is best effort.
  }
  if (mode === "speaking") {
    mode = "idle";
    render();
  }
}
|
|
@@ -678,6 +824,7 @@
|
|
|
678
824
|
analyser.maxDecibels = -15; // ceiling (loud speech)
|
|
679
825
|
src.connect(analyser);
|
|
680
826
|
freqData = new Uint8Array(analyser.frequencyBinCount);
|
|
827
|
+
timeData = new Uint8Array(analyser.fftSize);
|
|
681
828
|
startWaveLoop();
|
|
682
829
|
} catch (e) {
|
|
683
830
|
console.warn("desktop renderer: AnalyserNode init failed", e);
|
|
@@ -691,14 +838,27 @@
|
|
|
691
838
|
recordedChunks = [];
|
|
692
839
|
mediaRecorder = new MediaRecorder(audioStream, { mimeType, audioBitsPerSecond: 32000 });
|
|
693
840
|
mediaRecorder.ondataavailable = (e) => {
|
|
841
|
+
// Just buffer. We deliberately do NOT decode on every chunk anymore —
|
|
842
|
+
// re-decoding the growing clip every 2s serialized on the single
|
|
843
|
+
// whisper thread and the final decode queued behind it (the old ~10s
|
|
844
|
+
// stall). Transcription now happens once, on a pause / on stop.
|
|
694
845
|
if (e.data && e.data.size > 0) recordedChunks.push(e.data);
|
|
695
|
-
runLivePartial();
|
|
696
846
|
};
|
|
697
847
|
mediaRecorder.onstop = async () => {
|
|
698
848
|
if (isCancelled) { recordedChunks = []; if (mode !== "idle") { mode = "idle"; render(); } return; }
|
|
699
|
-
|
|
700
|
-
|
|
849
|
+
let text = "";
|
|
850
|
+
// Auto-send after a pause: the pause already kicked a full decode that
|
|
851
|
+
// covers all the speech (the only thing after it is trailing silence),
|
|
852
|
+
// so reuse it instead of decoding the same audio again. Await the
|
|
853
|
+
// in-flight preview if it hasn't settled yet.
|
|
854
|
+
if (reuseLiveOnStop) {
|
|
855
|
+
if (livePromise) { try { await livePromise; } catch {} }
|
|
856
|
+
text = (pendingUserText || "").trim();
|
|
857
|
+
}
|
|
858
|
+
// Manual send (Enviar / ⌘G release) or no preview yet → one fresh decode.
|
|
859
|
+
if (!text) text = (await transcribeBuffered()).trim();
|
|
701
860
|
recordedChunks = [];
|
|
861
|
+
reuseLiveOnStop = false;
|
|
702
862
|
// Guard with .trim() — whisper occasionally returns a single space or
|
|
703
863
|
// newline for very short clips, which used to commit an empty bubble.
|
|
704
864
|
if (!text || isCancelled) {
|
|
@@ -711,7 +871,16 @@
|
|
|
711
871
|
pendingUserText = text;
|
|
712
872
|
commitUserMessage(text, /* via */ "voice");
|
|
713
873
|
};
|
|
714
|
-
|
|
874
|
+
// 1s timeslice: chunks land often enough that a pause-preview decode has
|
|
875
|
+
// audio to work with even for short utterances. We no longer decode per
|
|
876
|
+
// chunk (just buffer), so a smaller slice is essentially free.
|
|
877
|
+
mediaRecorder.start(1000);
|
|
878
|
+
// Recorder is now live → swap "Cargando…" for the reactive wave and let
|
|
879
|
+
// silence detection arm. lastVoiceTs starts now so a fully silent open
|
|
880
|
+
// won't auto-send (speechSeen gates that).
|
|
881
|
+
micReady = true;
|
|
882
|
+
lastVoiceTs = Date.now();
|
|
883
|
+
if (mode === "listening") render();
|
|
715
884
|
} catch (e) {
|
|
716
885
|
console.error("desktop renderer: mic error", e);
|
|
717
886
|
mode = "idle";
|
|
@@ -723,11 +892,16 @@
|
|
|
723
892
|
try { audioStream?.getTracks().forEach((t) => t.stop()); } catch {}
|
|
724
893
|
mediaRecorder = null;
|
|
725
894
|
audioStream = null;
|
|
895
|
+
micReady = false;
|
|
896
|
+
speechSeen = false;
|
|
897
|
+
lastVoiceTs = 0;
|
|
898
|
+
pausePreviewed = false;
|
|
726
899
|
stopWaveLoop();
|
|
727
900
|
try { audioCtx?.close(); } catch {}
|
|
728
901
|
audioCtx = null;
|
|
729
902
|
analyser = null;
|
|
730
903
|
freqData = null;
|
|
904
|
+
timeData = null;
|
|
731
905
|
}
|
|
732
906
|
|
|
733
907
|
// ── Reactive wave: amplitude-driven bar heights (runs while mode === listening)
|
|
@@ -738,6 +912,43 @@
|
|
|
738
912
|
const tick = () => {
|
|
739
913
|
if (mode !== "listening" || !analyser) { waveRaf = null; return; }
|
|
740
914
|
analyser.getByteFrequencyData(freqData);
|
|
915
|
+
|
|
916
|
+
// ── Silence auto-send ──────────────────────────────────────────────
|
|
917
|
+
// Time-domain RMS is a reliable voice/silence gate (unlike the freq
|
|
918
|
+
// bars, it's independent of the analyser's dB scaling). Once we've heard
|
|
919
|
+
// speech, SILENCE_MS of quiet commits the recording on its own.
|
|
920
|
+
if (micReady && timeData) {
|
|
921
|
+
analyser.getByteTimeDomainData(timeData);
|
|
922
|
+
let sumSq = 0;
|
|
923
|
+
for (let i = 0; i < timeData.length; i++) {
|
|
924
|
+
const v = (timeData[i] - 128) / 128;
|
|
925
|
+
sumSq += v * v;
|
|
926
|
+
}
|
|
927
|
+
const rms = Math.sqrt(sumSq / timeData.length);
|
|
928
|
+
const now = Date.now();
|
|
929
|
+
if (rms > VOICE_RMS) {
|
|
930
|
+
speechSeen = true;
|
|
931
|
+
lastVoiceTs = now;
|
|
932
|
+
pausePreviewed = false; // new speech → allow a fresh preview
|
|
933
|
+
} else if (speechSeen && lastVoiceTs) {
|
|
934
|
+
const silentFor = now - lastVoiceTs;
|
|
935
|
+
// A short pause kicks ONE decode of everything said so far. It doubles
|
|
936
|
+
// as the final transcription, so the auto-send below is instant
|
|
937
|
+
// instead of paying a decode after stop.
|
|
938
|
+
if (!pausePreviewed && silentFor >= PAUSE_PREVIEW_MS && !liveBusy) {
|
|
939
|
+
pausePreviewed = true;
|
|
940
|
+
runLivePartial();
|
|
941
|
+
}
|
|
942
|
+
// Sustained silence → auto-send, reusing the pause decode.
|
|
943
|
+
if (silentFor >= SILENCE_MS) {
|
|
944
|
+
waveRaf = null;
|
|
945
|
+
reuseLiveOnStop = true;
|
|
946
|
+
stopListening(/* commit */ true);
|
|
947
|
+
return;
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
|
|
741
952
|
const wave = $capCenter.querySelector(".cap-wave");
|
|
742
953
|
if (wave) {
|
|
743
954
|
const bars = wave.children;
|
|
@@ -781,17 +992,20 @@
|
|
|
781
992
|
} catch {}
|
|
782
993
|
return "";
|
|
783
994
|
}
|
|
784
|
-
|
|
995
|
+
/**
 * Kicks one decode of the audio buffered so far.
 *
 * Does nothing when a decode is already running, when the mode is not
 * "listening", or when no chunks have been recorded. Otherwise the busy
 * flag is raised, the decode is started, and its promise is stored in
 * `livePromise` so other code can await the in-flight decode. A non-empty
 * result is stashed in `pendingUserText`, but only if the mode is still
 * "listening" when the decode settles. The busy flag is always lowered.
 *
 * @returns {Promise<void>|undefined} The decode promise, or undefined when
 *   no decode was started.
 */
function runLivePartial() {
  const canDecode = !liveBusy && mode === "listening" && recordedChunks.length;
  if (!canDecode) return;

  liveBusy = true;
  const decodeOnce = async () => {
    try {
      const decoded = await transcribeBuffered();
      if (decoded && mode === "listening") pendingUserText = decoded;
    } finally {
      liveBusy = false;
    }
  };
  livePromise = decodeOnce();
  return livePromise;
}
|
|
796
1010
|
|
|
797
1011
|
// ── Send: text path + post-transcription commit path ─────────────────────
|
|
@@ -819,12 +1033,10 @@
|
|
|
819
1033
|
// one ResizeObserver tick later). Shared by commitUserMessage + regen so
|
|
820
1034
|
// both paths set up the daemon-event pipeline identically.
|
|
821
1035
|
/**
 * Sets up a locally-initiated agent turn.
 *
 * Order matters: the turn bookkeeping is reset first, then the capsule
 * switches to "thinking" and re-renders, the conversation container is
 * ensured, and finally a window resize is requested.
 */
function startAgentTurn() {
  // Bump currentTurn and reset the audio queue/guards.
  beginAgentTurn();

  mode = "thinking";
  render();

  // Segments will mount their own bubbles.
  ensureConv();
  requestWindowResize();
}
|
|
830
1042
|
|
|
@@ -861,73 +1073,88 @@
|
|
|
861
1073
|
window.apx?.onDaemonEvent?.((msg) => {
|
|
862
1074
|
switch (msg.type) {
|
|
863
1075
|
case "thinking":
|
|
864
|
-
|
|
865
|
-
|
|
1076
|
+
// Marks the start of a turn. For locally-initiated turns startAgentTurn
|
|
1077
|
+
// already ran beginAgentTurn() (mode is already "thinking"); for turns
|
|
1078
|
+
// NOT initiated in this window (injected / broadcast from another client)
|
|
1079
|
+
// we set them up here so currentTurn/queue/doneHandled are correct and
|
|
1080
|
+
// the turn doesn't hang.
|
|
1081
|
+
if (mode !== "thinking" && mode !== "speaking") {
|
|
1082
|
+
beginAgentTurn();
|
|
1083
|
+
mode = "thinking";
|
|
1084
|
+
render();
|
|
1085
|
+
} else {
|
|
1086
|
+
doneHandled = false;
|
|
1087
|
+
}
|
|
1088
|
+
ensureConv();
|
|
866
1089
|
break;
|
|
867
1090
|
case "token":
|
|
1091
|
+
// Legacy path (backend no longer streams tokens for desktop). Kept so a
|
|
1092
|
+
// mixed-version daemon doesn't break — accumulate into a single bubble.
|
|
868
1093
|
appendStreamingToken(msg.text || "");
|
|
869
1094
|
break;
|
|
870
1095
|
case "tool_start": addToolPill(msg.name); break;
|
|
871
1096
|
case "tool_done": updateToolPill(msg.name); break;
|
|
1097
|
+
case "segment": {
|
|
1098
|
+
// Each segment is its own agent message bubble + its own audio.
|
|
1099
|
+
ensureConv();
|
|
1100
|
+
const text = (msg.text || "").trim();
|
|
1101
|
+
if (!text) break;
|
|
1102
|
+
const id = nextId++;
|
|
1103
|
+
const m = { id, seq: msg.seq || 0, turn: currentTurn, role: "agent", text, t: nowHHMM(), audio: null, dur: null };
|
|
1104
|
+
messages.push(m);
|
|
1105
|
+
appendTurn(m, true);
|
|
1106
|
+
queueRegisterSegment(m);
|
|
1107
|
+
// Synthesize THIS segment; tts-ready(seg=id) attaches its audio + queues
|
|
1108
|
+
// it for gapless sequential playback.
|
|
1109
|
+
window.apx?.requestTts?.(text, id);
|
|
1110
|
+
requestWindowResize();
|
|
1111
|
+
scrollConvToBottom();
|
|
1112
|
+
break;
|
|
1113
|
+
}
|
|
872
1114
|
case "done": {
|
|
873
|
-
// Daemon may emit `done` twice (retry/race). Process only once per turn.
|
|
874
1115
|
if (doneHandled) break;
|
|
875
1116
|
doneHandled = true;
|
|
876
|
-
|
|
877
|
-
//
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
//
|
|
881
|
-
//
|
|
882
|
-
if (
|
|
883
|
-
|
|
884
|
-
if (!
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
// the already-rendered turn (see attachAudioToLastAgentTurn below).
|
|
893
|
-
const finalizedTurnId = streamingAgentEntry?.id;
|
|
894
|
-
finalizeStreamingAgent();
|
|
895
|
-
mode = "idle"; render();
|
|
896
|
-
// Fire-and-forget TTS request. If it returns audio, attach it to
|
|
897
|
-
// the turn we just rendered; if it errors / times out / never replies,
|
|
898
|
-
// no big deal — the user already has the text. Guard with a 6s soft
|
|
899
|
-
// timeout so a stuck request doesn't hold ttsTimer state.
|
|
900
|
-
const handled = window.apx?.requestTts?.(finalText);
|
|
901
|
-
if (handled) {
|
|
902
|
-
if (ttsTimer) clearTimeout(ttsTimer);
|
|
903
|
-
ttsTimer = setTimeout(() => { ttsTimer = null; }, 6000);
|
|
904
|
-
// Remember which turn the next tts-ready/failed belongs to.
|
|
905
|
-
pendingTtsTurnId = finalizedTurnId || null;
|
|
1117
|
+
turnDone = true;
|
|
1118
|
+
// Record the whole turn as one assistant entry for conversation context.
|
|
1119
|
+
const full = (msg.text || "").trim();
|
|
1120
|
+
if (full) history.push({ role: "assistant", content: full });
|
|
1121
|
+
// Safety net: if some segment's TTS never resolves, flush after 12s so
|
|
1122
|
+
// the capsule can't get stuck in "Pensando…".
|
|
1123
|
+
if (turnWatchdog) clearTimeout(turnWatchdog);
|
|
1124
|
+
turnWatchdog = setTimeout(() => {
|
|
1125
|
+
turnAudios.forEach((e) => { if (!e.ready) { e.ready = true; e.failed = true; e.played = true; } });
|
|
1126
|
+
pumpAudioQueue();
|
|
1127
|
+
}, 12000);
|
|
1128
|
+
// Play whatever audio is already ready; flip to idle if there's nothing
|
|
1129
|
+
// left to play (e.g. a turn that produced no audio).
|
|
1130
|
+
pumpAudioQueue();
|
|
1131
|
+
if (!queuePlaying && audioCursor >= turnAudios.length && mode !== "speaking") {
|
|
1132
|
+
mode = "idle"; render();
|
|
906
1133
|
}
|
|
907
1134
|
break;
|
|
908
1135
|
}
|
|
909
|
-
case "tts-ready":
|
|
910
|
-
if (
|
|
911
|
-
if (pendingTtsTurnId != null) {
|
|
912
|
-
attachAudioToTurn(pendingTtsTurnId, { url: msg.url, dur: msg.duration });
|
|
913
|
-
pendingTtsTurnId = null;
|
|
914
|
-
}
|
|
1136
|
+
case "tts-ready":
|
|
1137
|
+
if (msg.seg != null) attachAudioToTurn(msg.seg, { url: msg.url, dur: msg.duration });
|
|
915
1138
|
break;
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
if (
|
|
920
|
-
pendingTtsTurnId = null;
|
|
1139
|
+
case "tts-failed": {
|
|
1140
|
+
// No audio for this segment — skip it in the queue so playback advances.
|
|
1141
|
+
const m = (msg.seg != null) ? messages.find((x) => x.id === msg.seg) : null;
|
|
1142
|
+
if (m) queueMarkFailed(m);
|
|
921
1143
|
break;
|
|
922
|
-
|
|
923
|
-
|
|
1144
|
+
}
|
|
1145
|
+
case "error": {
|
|
1146
|
+
ensureConv();
|
|
1147
|
+
const id = nextId++;
|
|
1148
|
+
const m = { id, seq: 9999, turn: currentTurn, role: "agent", text: "Error: " + (msg.message || "Unknown error"), t: nowHHMM(), isError: true };
|
|
1149
|
+
messages.push(m);
|
|
1150
|
+
appendTurn(m, true);
|
|
1151
|
+
turnDone = true;
|
|
1152
|
+
if (mode !== "speaking") { mode = "idle"; render(); }
|
|
924
1153
|
break;
|
|
1154
|
+
}
|
|
925
1155
|
case "cancelled":
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
else finalizeStreamingAgent();
|
|
929
|
-
streamingAgentEntry = null;
|
|
930
|
-
}
|
|
1156
|
+
resetTurnAudio();
|
|
1157
|
+
turnDone = true;
|
|
931
1158
|
mode = "idle"; render();
|
|
932
1159
|
break;
|
|
933
1160
|
}
|
|
@@ -949,8 +1176,20 @@
|
|
|
949
1176
|
// Escape handling, in priority order:
//   1. anything in flight (recording / transcribing / thinking / speaking)
//      is cancelled;
//   2. otherwise a non-blank typed draft is cleared;
//   3. otherwise the window is closed.
document.addEventListener("keydown", (event) => {
  if (event.key !== "Escape") return;
  event.preventDefault();

  const busyModes = ["listening", "transcribing", "thinking", "speaking"];
  if (busyModes.includes(mode)) {
    cancel();
    return;
  }

  const draft = $capCenter.querySelector("input");
  const hasDraft = Boolean(draft && draft.value.trim());
  if (!hasDraft) {
    closeWindow();
    return;
  }

  draft.value = "";
  render();
});
|
|
956
1195
|
|
|
@@ -975,6 +1214,13 @@
|
|
|
975
1214
|
setInterval(requestWindowResize, 250);
|
|
976
1215
|
}
|
|
977
1216
|
|
|
1217
|
+
// ── Keep STT warm ────────────────────────────────────────────────────────
// Ask the host to warm up speech-to-text once right away and then again
// every 4 minutes for as long as this window is running, so the model is
// already loaded when the user first speaks.
{
  const FOUR_MINUTES_MS = 4 * 60 * 1000;
  const pingStt = () => { window.apx?.warmupStt?.(); };
  pingStt();
  setInterval(pingStt, FOUR_MINUTES_MS);
}
|
|
1223
|
+
|
|
978
1224
|
// ── Helpers ──────────────────────────────────────────────────────────────
|
|
979
1225
|
function nowHHMM() {
|
|
980
1226
|
const d = new Date();
|
|
@@ -328,19 +328,30 @@ button { font-family: inherit; }
|
|
|
328
328
|
.wavebar i.cur { transform: scaleY(1.25); }
|
|
329
329
|
.audio .dur { font-size: 11px; color: var(--ink-3); font-variant-numeric: tabular-nums; min-width: 30px; text-align: right; }
|
|
330
330
|
|
|
331
|
-
/*
|
|
331
|
+
/* A continuation bubble (agent message without a repeated "Roby" header)
   sits tight against the bubble before it. */
.turn.cont {
  padding-top: 0;
}
.turn.cont .msg-agent {
  margin-top: 1px;
}
|
|
334
|
+
|
|
335
|
+
/* Inline copy icon at the end of an agent message. It is transparent until
   the turn is hovered, so it never reserves an empty row of vertical space.
   It is also revealed on keyboard focus: a focusable control that stays at
   opacity 0 cannot be seen by keyboard users. */
.msg-agent .btn-copy {
  display: inline-flex; align-items: center; vertical-align: -3px;
  margin-left: 6px; padding: 1px; border: none; background: transparent;
  cursor: pointer; color: var(--ink-3); opacity: 0; transition: opacity .15s ease, color .15s ease;
}
.turn:hover .msg-agent .btn-copy { opacity: .55; }
.msg-agent .btn-copy:hover,
.msg-agent .btn-copy:focus-visible { opacity: 1; color: var(--ink); }
.msg-agent .btn-copy.done { opacity: 1; color: oklch(0.6 0.15 150); }
|
|
345
|
+
|
|
346
|
+
/* Per-turn actions: only Regenerate remains, and it is shown solely on the
   LAST agent turn. Regenerating an older turn would replay the most recent
   user prompt (not the one that produced that reply) and silently break the
   flow, so on every other turn the row is hidden and takes no space. */
.turn-actions {
  display: none;
  gap: 4px;
  margin: 7px 0 0 24px;
  opacity: 0;
  transition: opacity .2s ease;
}
.turn.last .turn-actions {
  display: flex;
  opacity: 1;
}
|
|
344
355
|
.chip {
|
|
345
356
|
display: inline-flex; align-items: center; gap: 5px; cursor: pointer;
|
|
346
357
|
padding: 4px 8px; border-radius: 9px; border: 1px solid var(--glass-hairline);
|