@livx.cc/agentx 0.97.9 → 0.98.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -894,6 +894,9 @@ declare class DuplexAgentOptions {
894
894
  /** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
895
895
  * backchannels, impulsive first reactions before content (mimics real duplex conversation). */
896
896
  voiceStyle: 'neutral' | 'conversational';
897
+ /** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
898
+ * TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
899
+ emotionTags: boolean;
897
900
  /** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
898
901
  * (post-spawn would race the worker's first edits). */
899
902
  onTaskStart?: (id: string, label: string) => void | Promise<void>;
@@ -1275,6 +1278,11 @@ declare class VoiceEngineOptions {
1275
1278
  * speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
1276
1279
  * mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
1277
1280
  overlapEnergyHold: boolean;
1281
+ /** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
1282
+ * tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
1283
+ emotions: boolean;
1284
+ /** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
1285
+ showEmotions: boolean;
1278
1286
  }
1279
1287
  declare class VoiceEngine {
1280
1288
  options: VoiceEngineOptions;
@@ -1305,11 +1313,14 @@ declare class VoiceEngine {
1305
1313
  private resumeTimer;
1306
1314
  private turnStartAt;
1307
1315
  private uttQueue;
1316
+ private emo;
1308
1317
  constructor(options?: Partial<VoiceEngineOptions>);
1309
1318
  start(): Promise<void>;
1310
1319
  get usingAec(): boolean;
1311
1320
  /** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
1312
1321
  setBargeIn(on: boolean): void;
1322
+ /** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
1323
+ setShowEmotions(on: boolean): void;
1313
1324
  private idleWaiters;
1314
1325
  private setState;
1315
1326
  /** Resolve when the engine is no longer speaking (immediate if already idle). */
@@ -1318,7 +1329,9 @@ declare class VoiceEngine {
1318
1329
  * `ack` speaks the configured micro-ack as the context opener (utterance path only —
1319
1330
  * masks LLM TTFT; re-voice turns begun by their first delta skip it). */
1320
1331
  beginSpeech(ack?: boolean): void;
1321
- speakDelta(text: string): void;
1332
+ /** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
1333
+ * host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
1334
+ speakDelta(text: string): string;
1322
1335
  /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
1323
1336
  endSpeech(): void;
1324
1337
  /** text of the reply cut by the last barge-in — consumed by the host to tell the model what
package/dist/index.js CHANGED
@@ -4648,6 +4648,168 @@ function digestRun(messages, maxChars) {
4648
4648
  import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
4649
4649
  init_logging();
4650
4650
 
4651
+ // src/voice/emotion.ts
4652
+ init_logging();
4653
+ var log9 = forComponent("Emotion");
4654
+ var EMOTIONS = [
4655
+ // primary (best results)
4656
+ "neutral",
4657
+ "angry",
4658
+ "excited",
4659
+ "content",
4660
+ "sad",
4661
+ "scared",
4662
+ // extended
4663
+ "happy",
4664
+ "enthusiastic",
4665
+ "elated",
4666
+ "triumphant",
4667
+ "amazed",
4668
+ "surprised",
4669
+ "flirtatious",
4670
+ "curious",
4671
+ "calm",
4672
+ "grateful",
4673
+ "affectionate",
4674
+ "sympathetic",
4675
+ "mysterious",
4676
+ "frustrated",
4677
+ "disgusted",
4678
+ "sarcastic",
4679
+ "disappointed",
4680
+ "hurt",
4681
+ "guilty",
4682
+ "bored",
4683
+ "tired",
4684
+ "nostalgic",
4685
+ "apologetic",
4686
+ "hesitant",
4687
+ "confused",
4688
+ "anxious",
4689
+ "panicked",
4690
+ "proud",
4691
+ "confident",
4692
+ "skeptical",
4693
+ "contemplative",
4694
+ "determined"
4695
+ ];
4696
+ var VALID = new Set(EMOTIONS);
4697
+ var ALIASES = {
4698
+ cheerful: "happy",
4699
+ joyful: "happy",
4700
+ joy: "happy",
4701
+ glad: "happy",
4702
+ pleased: "happy",
4703
+ warm: "affectionate",
4704
+ thrilled: "excited",
4705
+ eager: "enthusiastic",
4706
+ ecstatic: "elated",
4707
+ euphoric: "elated",
4708
+ mad: "angry",
4709
+ furious: "angry",
4710
+ annoyed: "frustrated",
4711
+ irritated: "frustrated",
4712
+ agitated: "frustrated",
4713
+ shocked: "surprised",
4714
+ astonished: "amazed",
4715
+ wonder: "amazed",
4716
+ worried: "anxious",
4717
+ nervous: "anxious",
4718
+ afraid: "scared",
4719
+ alarmed: "panicked",
4720
+ unsure: "hesitant",
4721
+ uncertain: "hesitant",
4722
+ doubtful: "skeptical",
4723
+ suspicious: "skeptical",
4724
+ thoughtful: "contemplative",
4725
+ focused: "determined",
4726
+ serious: "determined",
4727
+ playful: "flirtatious",
4728
+ teasing: "flirtatious",
4729
+ ironic: "sarcastic",
4730
+ cheeky: "sarcastic",
4731
+ thankful: "grateful",
4732
+ sorry: "apologetic",
4733
+ down: "sad",
4734
+ melancholic: "sad",
4735
+ gloomy: "sad",
4736
+ peaceful: "calm",
4737
+ serene: "calm",
4738
+ relaxed: "calm",
4739
+ sleepy: "tired"
4740
+ };
4741
+ var NONVERBAL = { laughter: "laughter", laughs: "laughter", laugh: "laughter", laughing: "laughter" };
4742
+ function normalizeEmotion(raw) {
4743
+ const k = raw.trim().toLowerCase();
4744
+ if (VALID.has(k)) return k;
4745
+ return ALIASES[k] ?? null;
4746
+ }
4747
+ function resolveTag(raw) {
4748
+ const k = raw.trim().toLowerCase();
4749
+ if (NONVERBAL[k]) return { kind: "nonverbal", value: NONVERBAL[k] };
4750
+ const e = normalizeEmotion(k);
4751
+ return e ? { kind: "emotion", value: e } : null;
4752
+ }
4753
+ var TAG_RE = /\[([a-zA-Z][a-zA-Z ]{0,24})\]/g;
4754
+ var PARTIAL_RE = /\[[a-zA-Z ]*$/;
4755
+ var cartesiaTag = (t) => t.kind === "nonverbal" ? `[${t.value}]` : `<emotion value="${t.value}"/>`;
4756
+ var EmotionStream = class {
4757
+ constructor(show = true) {
4758
+ this.show = show;
4759
+ }
4760
+ show;
4761
+ buf = "";
4762
+ pending = null;
4763
+ feed(delta) {
4764
+ this.buf += delta;
4765
+ return this.drain(false);
4766
+ }
4767
+ flush() {
4768
+ return this.drain(true);
4769
+ }
4770
+ drain(final) {
4771
+ let body = this.buf;
4772
+ if (!final) {
4773
+ const p = body.match(PARTIAL_RE);
4774
+ if (p) {
4775
+ this.buf = p[0];
4776
+ body = body.slice(0, body.length - p[0].length);
4777
+ } else this.buf = "";
4778
+ } else this.buf = "";
4779
+ let speech = "", display = "", prose = "", last = 0;
4780
+ TAG_RE.lastIndex = 0;
4781
+ for (let m = TAG_RE.exec(body); m; m = TAG_RE.exec(body)) {
4782
+ this.emit(body.slice(last, m.index), (s, d, p) => {
4783
+ speech += s;
4784
+ display += d;
4785
+ prose += p;
4786
+ });
4787
+ const tag = resolveTag(m[1]);
4788
+ if (tag) {
4789
+ this.pending = tag;
4790
+ if (this.show) display += m[0];
4791
+ } else log9.debug(`dropping unknown emotion tag ${m[0]}`);
4792
+ last = m.index + m[0].length;
4793
+ }
4794
+ this.emit(body.slice(last), (s, d, p) => {
4795
+ speech += s;
4796
+ display += d;
4797
+ prose += p;
4798
+ });
4799
+ return { speech, display, prose };
4800
+ }
4801
+ /** Emit a prose span, flushing any pending tag onto its FRONT (only once real words appear). */
4802
+ emit(text, sink) {
4803
+ if (!text) return;
4804
+ let speech = text;
4805
+ if (this.pending && /[\p{L}\p{N}]/u.test(text)) {
4806
+ speech = cartesiaTag(this.pending) + text;
4807
+ this.pending = null;
4808
+ }
4809
+ sink(speech, text, text);
4810
+ }
4811
+ };
4812
+
4651
4813
  // src/voice/spokenSplitter.ts
4652
4814
  var OPEN = "<spoken>";
4653
4815
  var CLOSE = "</spoken>";
@@ -4721,7 +4883,7 @@ var SpokenSplitter = class {
4721
4883
  };
4722
4884
 
4723
4885
  // src/duplex.ts
4724
- var log9 = forComponent("DuplexAgent");
4886
+ var log10 = forComponent("DuplexAgent");
4725
4887
  function describeCall(call) {
4726
4888
  const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
4727
4889
  const hint = v ? ` (${String(v).replace(/\s+/g, " ").trim().slice(0, 48)})` : "";
@@ -4760,6 +4922,9 @@ var DuplexAgentOptions = class {
4760
4922
  /** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
4761
4923
  * backchannels, impulsive first reactions before content (mimics real duplex conversation). */
4762
4924
  voiceStyle = "neutral";
4925
+ /** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
4926
+ * TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
4927
+ emotionTags = false;
4763
4928
  /** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
4764
4929
  * (post-spawn would race the worker's first edits). */
4765
4930
  onTaskStart;
@@ -4792,6 +4957,7 @@ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEAR
4792
4957
  var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
4793
4958
  var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
4794
4959
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
4960
+ var EMOTION_TAGS_GUIDANCE = `EMOTION: your voice is synthesized with emotion control. Prefix a sentence with an inline [emotion] tag, placed directly before the sentence it colors, to shape how it is spoken. Use it ONLY when the emotion genuinely fits the words (it amplifies real feeling, it cannot fake it) \u2014 do not tag every sentence; reserve it for moments that carry feeling, and vary which one you use. You may also drop [laughter] for a natural laugh. Available emotions: ${EMOTIONS.join(", ")}.`;
4795
4961
  var DuplexAgent = class _DuplexAgent {
4796
4962
  options;
4797
4963
  voice;
@@ -4861,7 +5027,7 @@ var DuplexAgent = class _DuplexAgent {
4861
5027
  ...new Set(workerToolNames.filter((n) => n.startsWith("mcp__")).map((n) => n.slice(5).split("__")[0]))
4862
5028
  ];
4863
5029
  const workerMcp = mcpNames.length ? `, and it can use these MCP servers: ${[...new Set(mcpNames)].join(", ")}` + (mcpNames.some((n) => /browser/i.test(n)) ? ' \u2014 including driving a REAL browser (open tabs, navigate, click, screenshot), so answer "yes" if asked whether you can control/drive a browser and route an actual browse to Act' : "") : "";
4864
- const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
5030
+ const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + (o.emotionTags ? "\n" + EMOTION_TAGS_GUIDANCE : "") + `
4865
5031
  Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
4866
5032
  const tools = [
4867
5033
  ...o.reflexOptions?.tools ?? [],
@@ -4885,7 +5051,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
4885
5051
  const m = this.reflexBuf.match(RESERVED_EVENT_MARKER) ?? this.reflexBuf.match(RESERVED_EVENT_OPENER);
4886
5052
  if (m) {
4887
5053
  this.fabricationCut = true;
4888
- log9.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
5054
+ log10.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
4889
5055
  const safe = this.reflexBuf.slice(this.reflexForwarded, m.index);
4890
5056
  if (!safe) return;
4891
5057
  if (safe.trim()) this.spokeThisTurn = true;
@@ -4987,7 +5153,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
4987
5153
  try {
4988
5154
  await this.voice.send(fallback ? "[reminder] You said nothing to the user this turn. Tell them, in ONE short spoken sentence, what just happened \u2014 no tools." : dispatched ? "[reminder] You dispatched a task but said nothing to the user. Say ONE short spoken acknowledgement now \u2014 no tools." : "[reminder] You said nothing to the user this turn. Give your ONE short spoken reply now \u2014 no tools.");
4989
5155
  } catch (e) {
4990
- log9.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
5156
+ log10.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
4991
5157
  } finally {
4992
5158
  this.nudging = false;
4993
5159
  }
@@ -5079,7 +5245,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
5079
5245
  buildBrief(brief, tier = "act", deliver = true) {
5080
5246
  const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
5081
5247
  const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
5082
- const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
5248
+ const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." + (this.options.emotionTags ? " Inside <spoken>, you may prefix a sentence with an inline [emotion] tag (e.g. [excited], [curious]) to color how it is voiced \u2014 only when it genuinely fits, and vary it; [laughter] gives a natural laugh." : "") : "";
5083
5249
  return (recent ? `${brief}
5084
5250
 
5085
5251
  ## Recent conversation (for context)
@@ -5195,7 +5361,7 @@ Another agent just implemented the above. Independently check the CURRENT state
5195
5361
  this.notify("task_verify", `task ${id}: verifying`, { id });
5196
5362
  const cres = await new Agent(checkerOpts).run(checkBrief);
5197
5363
  if (cres.finishReason !== "stop") {
5198
- log9.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
5364
+ log10.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
5199
5365
  this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
5200
5366
  }
5201
5367
  const sum = (a = 0, b = 0) => a + b;
@@ -5331,7 +5497,7 @@ Another agent just implemented the above. Independently check the CURRENT state
5331
5497
  rec.status = "done";
5332
5498
  rec.result = res.text;
5333
5499
  const incomplete = res.finishReason !== "stop";
5334
- log9.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
5500
+ log10.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
5335
5501
  this.notify("task_done", `task ${id} (${rec.label}) completed`, {
5336
5502
  id,
5337
5503
  text: res.text,
@@ -5357,7 +5523,7 @@ Another agent just implemented the above. Independently check the CURRENT state
5357
5523
  this.dropAsk(rec.id);
5358
5524
  rec.status = "error";
5359
5525
  rec.result = msg;
5360
- log9.warn(`task ${rec.id} failed: ${msg}`);
5526
+ log10.warn(`task ${rec.id} failed: ${msg}`);
5361
5527
  this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
5362
5528
  this.queueRevoice(this.integrationPrompt(rec, "error", msg, "error"), true);
5363
5529
  }
@@ -5720,7 +5886,7 @@ init_logging();
5720
5886
 
5721
5887
  // src/voice/engine.ts
5722
5888
  init_logging();
5723
- var log10 = forComponent("VoiceEngine");
5889
+ var log11 = forComponent("VoiceEngine");
5724
5890
  var now = () => performance.now();
5725
5891
  var forSpeech = (t) => t.replace(/[*_`#]+/g, "").replace(/^[ \t]*[-•]\s+/gm, "").replace(/\s*[\u2013\u2014]\s*/g, ", ").replace(/[\u2010\u2011]/g, "-").replace(/\s*\|\s*/g, ", ").replace(/(\d)\s+%/g, "$1%").replace(/\.{3,}/g, ".");
5726
5892
  var VoiceEngineOptions = class {
@@ -5788,6 +5954,11 @@ var VoiceEngineOptions = class {
5788
5954
  * speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
5789
5955
  * mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
5790
5956
  overlapEnergyHold = false;
5957
+ /** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
5958
+ * tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
5959
+ emotions = true;
5960
+ /** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
5961
+ showEmotions = false;
5791
5962
  };
5792
5963
  var VoiceEngine = class _VoiceEngine {
5793
5964
  options;
@@ -5833,6 +6004,9 @@ var VoiceEngine = class _VoiceEngine {
5833
6004
  // Central speech queue (above the TTS context): complete worker utterances serialize into ONE
5834
6005
  // playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
5835
6006
  uttQueue = [];
6007
+ // Per-turn emotion-tag parser (reset on beginSpeech) — converts `[emotion]` → Cartesia inline tags
6008
+ // for TTS, tracks tag-free prose for echo discrimination, and surfaces display text for the screen.
6009
+ emo = null;
5836
6010
  constructor(options) {
5837
6011
  this.options = { ...new VoiceEngineOptions(), ...options };
5838
6012
  const o = this.options;
@@ -5850,7 +6024,7 @@ var VoiceEngine = class _VoiceEngine {
5850
6024
  this.stt.onLevel = (rms) => this.handleLevel(rms);
5851
6025
  await Promise.all([this.tts.connect(), this.stt.start()]);
5852
6026
  this.setState("listening");
5853
- log10.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
6027
+ log11.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
5854
6028
  }
5855
6029
  get usingAec() {
5856
6030
  return this.stt.usingAec;
@@ -5859,6 +6033,10 @@ var VoiceEngine = class _VoiceEngine {
5859
6033
  setBargeIn(on) {
5860
6034
  this.options.bargeIn = on;
5861
6035
  }
6036
+ /** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
6037
+ setShowEmotions(on) {
6038
+ this.options.showEmotions = on;
6039
+ }
5862
6040
  idleWaiters = [];
5863
6041
  setState(s) {
5864
6042
  if (this.state === s) return;
@@ -5890,6 +6068,7 @@ var VoiceEngine = class _VoiceEngine {
5890
6068
  this.ctxOpen = true;
5891
6069
  this.spokeDeltas = false;
5892
6070
  this.reply = "";
6071
+ this.emo = this.options.emotions ? new EmotionStream(this.options.showEmotions) : null;
5893
6072
  this.echoWords = new Set(this.words(this.prevReply));
5894
6073
  this.tts.newContext();
5895
6074
  if (ack && this.options.ackPhrase) {
@@ -5900,21 +6079,31 @@ var VoiceEngine = class _VoiceEngine {
5900
6079
  if (!this.turnStartAt) this.turnStartAt = now();
5901
6080
  this.setState("thinking");
5902
6081
  }
6082
+ /** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
6083
+ * host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
5903
6084
  speakDelta(text) {
5904
- if (this.interrupted) return;
6085
+ if (this.interrupted) return "";
5905
6086
  if (!this.speaking || !this.ctxOpen) this.beginSpeech();
5906
- this.reply += text;
6087
+ const { speech, display, prose } = this.emo ? this.emo.feed(text) : { speech: text, display: text, prose: text };
6088
+ this.reply += prose;
5907
6089
  for (const w of this.words(this.reply)) this.echoWords.add(w);
5908
- this.tts.speak(forSpeech(text), true);
5909
- if (!this.spokeDeltas && this.turnStartAt) log10.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
6090
+ this.tts.speak(forSpeech(speech), true);
6091
+ if (!this.spokeDeltas && this.turnStartAt) log11.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
5910
6092
  this.spokeDeltas = true;
5911
6093
  this.setState("speaking");
6094
+ return display;
5912
6095
  }
5913
6096
  /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
5914
6097
  endSpeech() {
5915
6098
  this.interrupted = false;
5916
6099
  if (!this.speaking) return;
5917
6100
  this.ctxOpen = false;
6101
+ if (this.emo) {
6102
+ const t = this.emo.flush();
6103
+ this.emo = null;
6104
+ if (t.prose) this.reply += t.prose;
6105
+ if (t.speech) this.tts.speak(forSpeech(t.speech), true);
6106
+ }
5918
6107
  if (this.reply) this.prevReply = this.reply;
5919
6108
  const settle = () => {
5920
6109
  if (this.ctxOpen) {
@@ -5927,7 +6116,7 @@ var VoiceEngine = class _VoiceEngine {
5927
6116
  }
5928
6117
  this.drainTimer = null;
5929
6118
  this.speaking = false;
5930
- if (this.turnStartAt) log10.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
6119
+ if (this.turnStartAt) log11.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
5931
6120
  this.echoUntil = now() + 2500;
5932
6121
  if (!this.usingAec) this.stt.reset();
5933
6122
  this.setState("listening");
@@ -6119,7 +6308,7 @@ var VoiceEngine = class _VoiceEngine {
6119
6308
  this.pendingUtt = this.mergeUtterance(this.pendingUtt, text);
6120
6309
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
6121
6310
  if (this.options.incompleteMergeMs && this.looksIncomplete(this.pendingUtt)) {
6122
- log10.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
6311
+ log11.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
6123
6312
  this.options.onHold();
6124
6313
  if (this.options.holdFiller && !this.speaking) {
6125
6314
  this.beginSpeech();
@@ -6218,7 +6407,7 @@ async function resolveAuth(auth) {
6218
6407
  }
6219
6408
 
6220
6409
  // src/voice/soniox.ts
6221
- var log11 = forComponent("SonioxSTT");
6410
+ var log12 = forComponent("SonioxSTT");
6222
6411
  var now2 = () => performance.now();
6223
6412
  var SonioxSTTOptions = class {
6224
6413
  auth = "";
@@ -6287,9 +6476,9 @@ var SonioxSTT = class {
6287
6476
  this.ws.onmessage = (ev) => this.handle(JSON.parse(String(ev.data)));
6288
6477
  this.ws.onclose = (ev) => {
6289
6478
  if (this.stopped) return;
6290
- log11.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
6479
+ log12.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
6291
6480
  this.reset();
6292
- this.connectWs().catch((e) => log11.error(`soniox reconnect failed: ${e.message}`));
6481
+ this.connectWs().catch((e) => log12.error(`soniox reconnect failed: ${e.message}`));
6293
6482
  };
6294
6483
  }
6295
6484
  async start() {
@@ -6299,7 +6488,7 @@ var SonioxSTT = class {
6299
6488
  this.endpointTimer = setInterval(() => {
6300
6489
  const combined = (this.finalText + this.partialText).trim();
6301
6490
  if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
6302
- if (this.firstTokenAt) log11.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
6491
+ if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
6303
6492
  this.reset();
6304
6493
  this.onUtterance(combined, now2());
6305
6494
  }, 120);
@@ -6311,7 +6500,7 @@ var SonioxSTT = class {
6311
6500
  if (this.stopped) return;
6312
6501
  const ref = this.lastChunkAt || this.startedChunksAt;
6313
6502
  if (now2() - ref > noAudioMs) {
6314
- log11.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
6503
+ log12.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
6315
6504
  this.onFatal("microphone stopped delivering audio (try a different input device, e.g. AirPods, or check System Settings \u2192 Sound \u2192 Input)");
6316
6505
  this.stop();
6317
6506
  }
@@ -6331,7 +6520,7 @@ var SonioxSTT = class {
6331
6520
  });
6332
6521
  }
6333
6522
  handle(m) {
6334
- if (m.error_message) return log11.error(`soniox: ${m.error_message}`);
6523
+ if (m.error_message) return log12.error(`soniox: ${m.error_message}`);
6335
6524
  let endpoint = false;
6336
6525
  for (const t of m.tokens ?? []) {
6337
6526
  if (t.text === "<end>") endpoint = true;
@@ -6347,7 +6536,7 @@ var SonioxSTT = class {
6347
6536
  this.onPartial(combined);
6348
6537
  if (endpoint && this.finalText.trim()) {
6349
6538
  const utterance = this.finalText.trim();
6350
- if (this.firstTokenAt) log11.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
6539
+ if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
6351
6540
  this.reset();
6352
6541
  this.onUtterance(utterance, now2());
6353
6542
  }
@@ -6370,7 +6559,7 @@ var SonioxSTT = class {
6370
6559
 
6371
6560
  // src/voice/cartesia.ts
6372
6561
  init_logging();
6373
- var log12 = forComponent("CartesiaTTS");
6562
+ var log13 = forComponent("CartesiaTTS");
6374
6563
  var now3 = () => performance.now();
6375
6564
  var CartesiaTTSOptions = class {
6376
6565
  auth = "";
@@ -6420,9 +6609,9 @@ var CartesiaTTS = class _CartesiaTTS {
6420
6609
  this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
6421
6610
  });
6422
6611
  this.ws.onclose = (ev) => {
6423
- log12.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
6612
+ log13.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
6424
6613
  if (!this.closed) {
6425
- this.connecting = this.doConnect().catch((e) => log12.error(`cartesia reconnect failed: ${e.message}`));
6614
+ this.connecting = this.doConnect().catch((e) => log13.error(`cartesia reconnect failed: ${e.message}`));
6426
6615
  }
6427
6616
  };
6428
6617
  this.ws.onmessage = (ev) => {
@@ -6444,11 +6633,11 @@ var CartesiaTTS = class _CartesiaTTS {
6444
6633
  this.down = true;
6445
6634
  this.downAt = now3();
6446
6635
  this.consecutiveOk = 0;
6447
- log12.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
6636
+ log13.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
6448
6637
  this.onDone();
6449
6638
  this.startProbe();
6450
6639
  } else if (!this.down) {
6451
- log12.warn(`cartesia: ${JSON.stringify(m)}`);
6640
+ log13.warn(`cartesia: ${JSON.stringify(m)}`);
6452
6641
  }
6453
6642
  }
6454
6643
  };
@@ -6462,7 +6651,7 @@ var CartesiaTTS = class _CartesiaTTS {
6462
6651
  this.consecutiveOk = 0;
6463
6652
  this.stopProbe();
6464
6653
  const downMs = this.downAt ? now3() - this.downAt : 0;
6465
- (downMs < 2e3 ? log12.debug : log12.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
6654
+ (downMs < 2e3 ? log13.debug : log13.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
6466
6655
  }
6467
6656
  /** Ensure the WS is open before sending — reconnects if idle-closed. */
6468
6657
  async ensureConnected() {