npm - @livx.cc/agentx - Versions diffs - 0.97.9 → 0.98.2 - Mend

@livx.cc/agentx 0.97.9 → 0.98.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.d.ts CHANGED Viewed

@@ -894,6 +894,9 @@ declare class DuplexAgentOptions {
     /** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
      *  backchannels, impulsive first reactions before content (mimics real duplex conversation). */
     voiceStyle: 'neutral' | 'conversational';
+    /** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
+     *  TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
+    emotionTags: boolean;
     /** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
      *  (post-spawn would race the worker's first edits). */
     onTaskStart?: (id: string, label: string) => void | Promise<void>;
@@ -949,6 +952,7 @@ declare class DuplexAgent {
     private turnDispatched;
     private turnBriefs;
     private spokeThisTurn;
+    private heldThisTurn;
     private nudging;
     private reflexBuf;
     private reflexForwarded;
@@ -1275,6 +1279,11 @@ declare class VoiceEngineOptions {
      *  speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
      *  mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
     overlapEnergyHold: boolean;
+    /** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
+     *  tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
+    emotions: boolean;
+    /** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
+    showEmotions: boolean;
 }
 declare class VoiceEngine {
     options: VoiceEngineOptions;
@@ -1305,11 +1314,14 @@ declare class VoiceEngine {
     private resumeTimer;
     private turnStartAt;
     private uttQueue;
+    private emo;
     constructor(options?: Partial<VoiceEngineOptions>);
     start(): Promise<void>;
     get usingAec(): boolean;
     /** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
     setBargeIn(on: boolean): void;
+    /** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
+    setShowEmotions(on: boolean): void;
     private idleWaiters;
     private setState;
     /** Resolve when the engine is no longer speaking (immediate if already idle). */
@@ -1318,7 +1330,9 @@ declare class VoiceEngine {
      *  `ack` speaks the configured micro-ack as the context opener (utterance path only —
      *  masks LLM TTFT; re-voice turns begun by their first delta skip it). */
     beginSpeech(ack?: boolean): void;
-    speakDelta(text: string): void;
+    /** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
+     *  host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
+    speakDelta(text: string): string;
     /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
     endSpeech(): void;
     /** text of the reply cut by the last barge-in — consumed by the host to tell the model what

package/dist/index.js CHANGED Viewed

@@ -4648,6 +4648,168 @@ function digestRun(messages, maxChars) {
 import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
 init_logging();
+// src/voice/emotion.ts
+init_logging();
+var log9 = forComponent("Emotion");
+var EMOTIONS = [
+  // primary (best results)
+  "neutral",
+  "angry",
+  "excited",
+  "content",
+  "sad",
+  "scared",
+  // extended
+  "happy",
+  "enthusiastic",
+  "elated",
+  "triumphant",
+  "amazed",
+  "surprised",
+  "flirtatious",
+  "curious",
+  "calm",
+  "grateful",
+  "affectionate",
+  "sympathetic",
+  "mysterious",
+  "frustrated",
+  "disgusted",
+  "sarcastic",
+  "disappointed",
+  "hurt",
+  "guilty",
+  "bored",
+  "tired",
+  "nostalgic",
+  "apologetic",
+  "hesitant",
+  "confused",
+  "anxious",
+  "panicked",
+  "proud",
+  "confident",
+  "skeptical",
+  "contemplative",
+  "determined"
+];
+var VALID = new Set(EMOTIONS);
+var ALIASES = {
+  cheerful: "happy",
+  joyful: "happy",
+  joy: "happy",
+  glad: "happy",
+  pleased: "happy",
+  warm: "affectionate",
+  thrilled: "excited",
+  eager: "enthusiastic",
+  ecstatic: "elated",
+  euphoric: "elated",
+  mad: "angry",
+  furious: "angry",
+  annoyed: "frustrated",
+  irritated: "frustrated",
+  agitated: "frustrated",
+  shocked: "surprised",
+  astonished: "amazed",
+  wonder: "amazed",
+  worried: "anxious",
+  nervous: "anxious",
+  afraid: "scared",
+  alarmed: "panicked",
+  unsure: "hesitant",
+  uncertain: "hesitant",
+  doubtful: "skeptical",
+  suspicious: "skeptical",
+  thoughtful: "contemplative",
+  focused: "determined",
+  serious: "determined",
+  playful: "flirtatious",
+  teasing: "flirtatious",
+  ironic: "sarcastic",
+  cheeky: "sarcastic",
+  thankful: "grateful",
+  sorry: "apologetic",
+  down: "sad",
+  melancholic: "sad",
+  gloomy: "sad",
+  peaceful: "calm",
+  serene: "calm",
+  relaxed: "calm",
+  sleepy: "tired"
+};
+var NONVERBAL = { laughter: "laughter", laughs: "laughter", laugh: "laughter", laughing: "laughter" };
+function normalizeEmotion(raw) {
+  const k = raw.trim().toLowerCase();
+  if (VALID.has(k)) return k;
+  return ALIASES[k] ?? null;
+}
+function resolveTag(raw) {
+  const k = raw.trim().toLowerCase();
+  if (NONVERBAL[k]) return { kind: "nonverbal", value: NONVERBAL[k] };
+  const e = normalizeEmotion(k);
+  return e ? { kind: "emotion", value: e } : null;
+}
+var TAG_RE = /\[([a-zA-Z][a-zA-Z ]{0,24})\]/g;
+var PARTIAL_RE = /\[[a-zA-Z ]*$/;
+var cartesiaTag = (t) => t.kind === "nonverbal" ? `[${t.value}]` : `<emotion value="${t.value}"/>`;
+var EmotionStream = class {
+  constructor(show = true) {
+    this.show = show;
+  }
+  show;
+  buf = "";
+  pending = null;
+  feed(delta) {
+    this.buf += delta;
+    return this.drain(false);
+  }
+  flush() {
+    return this.drain(true);
+  }
+  drain(final) {
+    let body = this.buf;
+    if (!final) {
+      const p = body.match(PARTIAL_RE);
+      if (p) {
+        this.buf = p[0];
+        body = body.slice(0, body.length - p[0].length);
+      } else this.buf = "";
+    } else this.buf = "";
+    let speech = "", display = "", prose = "", last = 0;
+    TAG_RE.lastIndex = 0;
+    for (let m = TAG_RE.exec(body); m; m = TAG_RE.exec(body)) {
+      this.emit(body.slice(last, m.index), (s, d, p) => {
+        speech += s;
+        display += d;
+        prose += p;
+      });
+      const tag = resolveTag(m[1]);
+      if (tag) {
+        this.pending = tag;
+        if (this.show) display += m[0];
+      } else log9.debug(`dropping unknown emotion tag ${m[0]}`);
+      last = m.index + m[0].length;
+    }
+    this.emit(body.slice(last), (s, d, p) => {
+      speech += s;
+      display += d;
+      prose += p;
+    });
+    return { speech, display, prose };
+  }
+  /** Emit a prose span, flushing any pending tag onto its FRONT (only once real words appear). */
+  emit(text, sink) {
+    if (!text) return;
+    let speech = text;
+    if (this.pending && /[\p{L}\p{N}]/u.test(text)) {
+      speech = cartesiaTag(this.pending) + text;
+      this.pending = null;
+    }
+    sink(speech, text, text);
+  }
+};
 // src/voice/spokenSplitter.ts
 var OPEN = "<spoken>";
 var CLOSE = "</spoken>";
@@ -4721,7 +4883,7 @@ var SpokenSplitter = class {
 };
 // src/duplex.ts
-var log9 = forComponent("DuplexAgent");
+var log10 = forComponent("DuplexAgent");
 function describeCall(call) {
   const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
   const hint = v ? ` (${String(v).replace(/\s+/g, " ").trim().slice(0, 48)})` : "";
@@ -4760,6 +4922,9 @@ var DuplexAgentOptions = class {
   /** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
    *  backchannels, impulsive first reactions before content (mimics real duplex conversation). */
   voiceStyle = "neutral";
+  /** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
+   *  TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
+  emotionTags = false;
   /** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
    *  (post-spawn would race the worker's first edits). */
   onTaskStart;
@@ -4792,6 +4957,7 @@ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEAR
 var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
 var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
 var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
+var EMOTION_TAGS_GUIDANCE = `EMOTION: your voice is synthesized with emotion control. Prefix a sentence with an inline [emotion] tag, placed directly before the sentence it colors, to shape how it is spoken. Use it ONLY when the emotion genuinely fits the words (it amplifies real feeling, it cannot fake it) \u2014 do not tag every sentence; reserve it for moments that carry feeling, and vary which one you use. You may also drop [laughter] for a natural laugh. Available emotions: ${EMOTIONS.join(", ")}.`;
 var DuplexAgent = class _DuplexAgent {
   options;
   voice;
@@ -4817,6 +4983,8 @@ var DuplexAgent = class _DuplexAgent {
   // briefs dispatched this turn (detect identical re-dispatch)
   spokeThisTurn = false;
   // any non-empty text_delta streamed this turn
+  heldThisTurn = false;
+  // Hold called this turn → turn is INTENTIONALLY silent (suppress reflex text + no dead-air ack)
   nudging = false;
   // re-ack pass in flight: block ALL tools, prevent recursion
   reflexBuf = "";
@@ -4861,7 +5029,7 @@ var DuplexAgent = class _DuplexAgent {
       ...new Set(workerToolNames.filter((n) => n.startsWith("mcp__")).map((n) => n.slice(5).split("__")[0]))
     ];
     const workerMcp = mcpNames.length ? `, and it can use these MCP servers: ${[...new Set(mcpNames)].join(", ")}` + (mcpNames.some((n) => /browser/i.test(n)) ? ' \u2014 including driving a REAL browser (open tabs, navigate, click, screenshot), so answer "yes" if asked whether you can control/drive a browser and route an actual browse to Act' : "") : "";
-    const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
+    const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + (o.emotionTags ? "\n" + EMOTION_TAGS_GUIDANCE : "") + `
 Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
     const tools = [
       ...o.reflexOptions?.tools ?? [],
@@ -4879,13 +5047,14 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
       confirm: host.confirm ? (p, m) => host.confirm(p, m) : void 0,
       notify: (ev) => {
         if (ev?.kind === "text_delta" && typeof ev.message === "string") {
+          if (this.heldThisTurn) return;
           if (this.fabricationCut) return;
           const msg = ev.message;
           this.reflexBuf += msg;
           const m = this.reflexBuf.match(RESERVED_EVENT_MARKER) ?? this.reflexBuf.match(RESERVED_EVENT_OPENER);
           if (m) {
             this.fabricationCut = true;
-            log9.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
+            log10.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
             const safe = this.reflexBuf.slice(this.reflexForwarded, m.index);
             if (!safe) return;
             if (safe.trim()) this.spokeThisTurn = true;
@@ -4948,6 +5117,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
     this.turnDispatched = false;
     this.turnBriefs.clear();
     this.spokeThisTurn = false;
+    this.heldThisTurn = false;
     this.reflexBuf = "";
     this.reflexForwarded = 0;
     this.fabricationCut = false;
@@ -4976,7 +5146,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
    *  voice) and emits an empty `final`, so no text_delta ever streams. Both ship silence; both repair.
    *  Requires a host: without one there's no stream to detect speech on (and no one to speak to). */
   get silentTurn() {
-    return !!this.options.host && !this.spokeThisTurn;
+    return !!this.options.host && !this.spokeThisTurn && !this.heldThisTurn;
   }
   /** A turn that voiced nothing is dead air. Re-prompt the reflex ONCE so the LLM itself voices a short
    *  line (no template). If it STILL says nothing, fall back to a minimal line so silence never ships.
@@ -4987,7 +5157,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
     try {
       await this.voice.send(fallback ? "[reminder] You said nothing to the user this turn. Tell them, in ONE short spoken sentence, what just happened \u2014 no tools." : dispatched ? "[reminder] You dispatched a task but said nothing to the user. Say ONE short spoken acknowledgement now \u2014 no tools." : "[reminder] You said nothing to the user this turn. Give your ONE short spoken reply now \u2014 no tools.");
     } catch (e) {
-      log9.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
+      log10.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
     } finally {
       this.nudging = false;
     }
@@ -5079,7 +5249,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
   buildBrief(brief, tier = "act", deliver = true) {
     const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
     const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
-    const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
+    const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." + (this.options.emotionTags ? " Inside <spoken>, you may prefix a sentence with an inline [emotion] tag (e.g. [excited], [curious]) to color how it is voiced \u2014 only when it genuinely fits, and vary it; [laughter] gives a natural laugh." : "") : "";
     return (recent ? `${brief}
 ## Recent conversation (for context)
@@ -5195,7 +5365,7 @@ Another agent just implemented the above. Independently check the CURRENT state
     this.notify("task_verify", `task ${id}: verifying`, { id });
     const cres = await new Agent(checkerOpts).run(checkBrief);
     if (cres.finishReason !== "stop") {
-      log9.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
+      log10.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
       this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
     }
     const sum = (a = 0, b = 0) => a + b;
@@ -5331,7 +5501,7 @@ Another agent just implemented the above. Independently check the CURRENT state
     rec.status = "done";
     rec.result = res.text;
     const incomplete = res.finishReason !== "stop";
-    log9.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
+    log10.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
     this.notify("task_done", `task ${id} (${rec.label}) completed`, {
       id,
       text: res.text,
@@ -5357,7 +5527,7 @@ Another agent just implemented the above. Independently check the CURRENT state
     this.dropAsk(rec.id);
     rec.status = "error";
     rec.result = msg;
-    log9.warn(`task ${rec.id} failed: ${msg}`);
+    log10.warn(`task ${rec.id} failed: ${msg}`);
     this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
     this.queueRevoice(this.integrationPrompt(rec, "error", msg, "error"), true);
   }
@@ -5537,6 +5707,7 @@ Another agent just implemented the above. Independently check the CURRENT state
         }
       },
       run: async ({ filler }) => {
+        this.heldThisTurn = true;
         if (filler) this.notify("hold_filler", String(filler));
         return "Holding \u2014 listening for the rest of the user's thought. Do not respond further this turn.";
       }
@@ -5720,7 +5891,7 @@ init_logging();
 // src/voice/engine.ts
 init_logging();
-var log10 = forComponent("VoiceEngine");
+var log11 = forComponent("VoiceEngine");
 var now = () => performance.now();
 var forSpeech = (t) => t.replace(/[*_`#]+/g, "").replace(/^[ \t]*[-•]\s+/gm, "").replace(/\s*[\u2013\u2014]\s*/g, ", ").replace(/[\u2010\u2011]/g, "-").replace(/\s*\|\s*/g, ", ").replace(/(\d)\s+%/g, "$1%").replace(/\.{3,}/g, ".");
 var VoiceEngineOptions = class {
@@ -5788,6 +5959,11 @@ var VoiceEngineOptions = class {
    *  speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
    *  mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
   overlapEnergyHold = false;
+  /** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
+   *  tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
+  emotions = true;
+  /** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
+  showEmotions = false;
 };
 var VoiceEngine = class _VoiceEngine {
   options;
@@ -5833,6 +6009,9 @@ var VoiceEngine = class _VoiceEngine {
   // Central speech queue (above the TTS context): complete worker utterances serialize into ONE
   // playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
   uttQueue = [];
+  // Per-turn emotion-tag parser (reset on beginSpeech) — converts `[emotion]` → Cartesia inline tags
+  // for TTS, tracks tag-free prose for echo discrimination, and surfaces display text for the screen.
+  emo = null;
   constructor(options) {
     this.options = { ...new VoiceEngineOptions(), ...options };
     const o = this.options;
@@ -5850,7 +6029,7 @@ var VoiceEngine = class _VoiceEngine {
     this.stt.onLevel = (rms) => this.handleLevel(rms);
     await Promise.all([this.tts.connect(), this.stt.start()]);
     this.setState("listening");
-    log10.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
+    log11.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
   }
   get usingAec() {
     return this.stt.usingAec;
@@ -5859,6 +6038,10 @@ var VoiceEngine = class _VoiceEngine {
   setBargeIn(on) {
     this.options.bargeIn = on;
   }
+  /** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
+  setShowEmotions(on) {
+    this.options.showEmotions = on;
+  }
   idleWaiters = [];
   setState(s) {
     if (this.state === s) return;
@@ -5890,6 +6073,7 @@ var VoiceEngine = class _VoiceEngine {
     this.ctxOpen = true;
     this.spokeDeltas = false;
     this.reply = "";
+    this.emo = this.options.emotions ? new EmotionStream(this.options.showEmotions) : null;
     this.echoWords = new Set(this.words(this.prevReply));
     this.tts.newContext();
     if (ack && this.options.ackPhrase) {
@@ -5900,21 +6084,31 @@ var VoiceEngine = class _VoiceEngine {
     if (!this.turnStartAt) this.turnStartAt = now();
     this.setState("thinking");
   }
+  /** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
+   *  host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
   speakDelta(text) {
-    if (this.interrupted) return;
+    if (this.interrupted) return "";
     if (!this.speaking || !this.ctxOpen) this.beginSpeech();
-    this.reply += text;
+    const { speech, display, prose } = this.emo ? this.emo.feed(text) : { speech: text, display: text, prose: text };
+    this.reply += prose;
     for (const w of this.words(this.reply)) this.echoWords.add(w);
-    this.tts.speak(forSpeech(text), true);
-    if (!this.spokeDeltas && this.turnStartAt) log10.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
+    this.tts.speak(forSpeech(speech), true);
+    if (!this.spokeDeltas && this.turnStartAt) log11.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
     this.spokeDeltas = true;
     this.setState("speaking");
+    return display;
   }
   /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
   endSpeech() {
     this.interrupted = false;
     if (!this.speaking) return;
     this.ctxOpen = false;
+    if (this.emo) {
+      const t = this.emo.flush();
+      this.emo = null;
+      if (t.prose) this.reply += t.prose;
+      if (t.speech) this.tts.speak(forSpeech(t.speech), true);
+    }
     if (this.reply) this.prevReply = this.reply;
     const settle = () => {
       if (this.ctxOpen) {
@@ -5927,7 +6121,7 @@ var VoiceEngine = class _VoiceEngine {
       }
       this.drainTimer = null;
       this.speaking = false;
-      if (this.turnStartAt) log10.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
+      if (this.turnStartAt) log11.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
       this.echoUntil = now() + 2500;
       if (!this.usingAec) this.stt.reset();
       this.setState("listening");
@@ -6119,7 +6313,7 @@ var VoiceEngine = class _VoiceEngine {
     this.pendingUtt = this.mergeUtterance(this.pendingUtt, text);
     if (this.pendingTimer) clearTimeout(this.pendingTimer);
     if (this.options.incompleteMergeMs && this.looksIncomplete(this.pendingUtt)) {
-      log10.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
+      log11.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
       this.options.onHold();
       if (this.options.holdFiller && !this.speaking) {
         this.beginSpeech();
@@ -6218,7 +6412,7 @@ async function resolveAuth(auth) {
 }
 // src/voice/soniox.ts
-var log11 = forComponent("SonioxSTT");
+var log12 = forComponent("SonioxSTT");
 var now2 = () => performance.now();
 var SonioxSTTOptions = class {
   auth = "";
@@ -6287,9 +6481,9 @@ var SonioxSTT = class {
     this.ws.onmessage = (ev) => this.handle(JSON.parse(String(ev.data)));
     this.ws.onclose = (ev) => {
       if (this.stopped) return;
-      log11.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
+      log12.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
       this.reset();
-      this.connectWs().catch((e) => log11.error(`soniox reconnect failed: ${e.message}`));
+      this.connectWs().catch((e) => log12.error(`soniox reconnect failed: ${e.message}`));
     };
   }
   async start() {
@@ -6299,7 +6493,7 @@ var SonioxSTT = class {
     this.endpointTimer = setInterval(() => {
       const combined = (this.finalText + this.partialText).trim();
       if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
-      if (this.firstTokenAt) log11.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
+      if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
       this.reset();
       this.onUtterance(combined, now2());
     }, 120);
@@ -6311,7 +6505,7 @@ var SonioxSTT = class {
         if (this.stopped) return;
         const ref = this.lastChunkAt || this.startedChunksAt;
         if (now2() - ref > noAudioMs) {
-          log11.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
+          log12.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
           this.onFatal("microphone stopped delivering audio (try a different input device, e.g. AirPods, or check System Settings \u2192 Sound \u2192 Input)");
           this.stop();
         }
@@ -6331,7 +6525,7 @@ var SonioxSTT = class {
     });
   }
   handle(m) {
-    if (m.error_message) return log11.error(`soniox: ${m.error_message}`);
+    if (m.error_message) return log12.error(`soniox: ${m.error_message}`);
     let endpoint = false;
     for (const t of m.tokens ?? []) {
       if (t.text === "<end>") endpoint = true;
@@ -6347,7 +6541,7 @@ var SonioxSTT = class {
     this.onPartial(combined);
     if (endpoint && this.finalText.trim()) {
       const utterance = this.finalText.trim();
-      if (this.firstTokenAt) log11.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
+      if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
       this.reset();
       this.onUtterance(utterance, now2());
     }
@@ -6370,7 +6564,7 @@ var SonioxSTT = class {
 // src/voice/cartesia.ts
 init_logging();
-var log12 = forComponent("CartesiaTTS");
+var log13 = forComponent("CartesiaTTS");
 var now3 = () => performance.now();
 var CartesiaTTSOptions = class {
   auth = "";
@@ -6420,9 +6614,9 @@ var CartesiaTTS = class _CartesiaTTS {
       this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
     });
     this.ws.onclose = (ev) => {
-      log12.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
+      log13.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
       if (!this.closed) {
-        this.connecting = this.doConnect().catch((e) => log12.error(`cartesia reconnect failed: ${e.message}`));
+        this.connecting = this.doConnect().catch((e) => log13.error(`cartesia reconnect failed: ${e.message}`));
       }
     };
     this.ws.onmessage = (ev) => {
@@ -6444,11 +6638,11 @@ var CartesiaTTS = class _CartesiaTTS {
           this.down = true;
           this.downAt = now3();
           this.consecutiveOk = 0;
-          log12.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
+          log13.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
           this.onDone();
           this.startProbe();
         } else if (!this.down) {
-          log12.warn(`cartesia: ${JSON.stringify(m)}`);
+          log13.warn(`cartesia: ${JSON.stringify(m)}`);
         }
       }
     };
@@ -6462,7 +6656,7 @@ var CartesiaTTS = class _CartesiaTTS {
     this.consecutiveOk = 0;
     this.stopProbe();
     const downMs = this.downAt ? now3() - this.downAt : 0;
-    (downMs < 2e3 ? log12.debug : log12.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
+    (downMs < 2e3 ? log13.debug : log13.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
   }
   /** Ensure the WS is open before sending — reconnects if idle-closed. */
   async ensureConnected() {