@livx.cc/agentx 0.97.9 → 0.98.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +347 -106
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +15 -1
- package/dist/index.js +224 -30
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -894,6 +894,9 @@ declare class DuplexAgentOptions {
|
|
|
894
894
|
/** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
|
|
895
895
|
* backchannels, impulsive first reactions before content (mimics real duplex conversation). */
|
|
896
896
|
voiceStyle: 'neutral' | 'conversational';
|
|
897
|
+
/** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
|
|
898
|
+
* TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
|
|
899
|
+
emotionTags: boolean;
|
|
897
900
|
/** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
|
|
898
901
|
* (post-spawn would race the worker's first edits). */
|
|
899
902
|
onTaskStart?: (id: string, label: string) => void | Promise<void>;
|
|
@@ -949,6 +952,7 @@ declare class DuplexAgent {
|
|
|
949
952
|
private turnDispatched;
|
|
950
953
|
private turnBriefs;
|
|
951
954
|
private spokeThisTurn;
|
|
955
|
+
private heldThisTurn;
|
|
952
956
|
private nudging;
|
|
953
957
|
private reflexBuf;
|
|
954
958
|
private reflexForwarded;
|
|
@@ -1275,6 +1279,11 @@ declare class VoiceEngineOptions {
|
|
|
1275
1279
|
* speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
|
|
1276
1280
|
* mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
|
|
1277
1281
|
overlapEnergyHold: boolean;
|
|
1282
|
+
/** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
|
|
1283
|
+
* tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
|
|
1284
|
+
emotions: boolean;
|
|
1285
|
+
/** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
|
|
1286
|
+
showEmotions: boolean;
|
|
1278
1287
|
}
|
|
1279
1288
|
declare class VoiceEngine {
|
|
1280
1289
|
options: VoiceEngineOptions;
|
|
@@ -1305,11 +1314,14 @@ declare class VoiceEngine {
|
|
|
1305
1314
|
private resumeTimer;
|
|
1306
1315
|
private turnStartAt;
|
|
1307
1316
|
private uttQueue;
|
|
1317
|
+
private emo;
|
|
1308
1318
|
constructor(options?: Partial<VoiceEngineOptions>);
|
|
1309
1319
|
start(): Promise<void>;
|
|
1310
1320
|
get usingAec(): boolean;
|
|
1311
1321
|
/** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
|
|
1312
1322
|
setBargeIn(on: boolean): void;
|
|
1323
|
+
/** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
|
|
1324
|
+
setShowEmotions(on: boolean): void;
|
|
1313
1325
|
private idleWaiters;
|
|
1314
1326
|
private setState;
|
|
1315
1327
|
/** Resolve when the engine is no longer speaking (immediate if already idle). */
|
|
@@ -1318,7 +1330,9 @@ declare class VoiceEngine {
|
|
|
1318
1330
|
* `ack` speaks the configured micro-ack as the context opener (utterance path only —
|
|
1319
1331
|
* masks LLM TTFT; re-voice turns begun by their first delta skip it). */
|
|
1320
1332
|
beginSpeech(ack?: boolean): void;
|
|
1321
|
-
|
|
1333
|
+
/** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
|
|
1334
|
+
* host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
|
|
1335
|
+
speakDelta(text: string): string;
|
|
1322
1336
|
/** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
|
|
1323
1337
|
endSpeech(): void;
|
|
1324
1338
|
/** text of the reply cut by the last barge-in — consumed by the host to tell the model what
|
package/dist/index.js
CHANGED
|
@@ -4648,6 +4648,168 @@ function digestRun(messages, maxChars) {
|
|
|
4648
4648
|
import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
|
|
4649
4649
|
init_logging();
|
|
4650
4650
|
|
|
4651
|
+
// src/voice/emotion.ts
|
|
4652
|
+
init_logging();
|
|
4653
|
+
var log9 = forComponent("Emotion");
|
|
4654
|
+
/**
 * Canonical emotion names, in the order they are listed to the model
 * (this array is joined into the emotion prompt guidance).
 */
var EMOTIONS = [
  // primary (best results)
  "neutral",
  "angry",
  "excited",
  "content",
  "sad",
  "scared",
  // extended
  "happy",
  "enthusiastic",
  "elated",
  "triumphant",
  "amazed",
  "surprised",
  "flirtatious",
  "curious",
  "calm",
  "grateful",
  "affectionate",
  "sympathetic",
  "mysterious",
  "frustrated",
  "disgusted",
  "sarcastic",
  "disappointed",
  "hurt",
  "guilty",
  "bored",
  "tired",
  "nostalgic",
  "apologetic",
  "hesitant",
  "confused",
  "anxious",
  "panicked",
  "proud",
  "confident",
  "skeptical",
  "contemplative",
  "determined"
];
/** Fast membership test for canonical emotion names. */
var VALID = new Set(EMOTIONS);
/** Near-synonyms mapped onto a canonical emotion from EMOTIONS. */
var ALIASES = {
  cheerful: "happy",
  joyful: "happy",
  joy: "happy",
  glad: "happy",
  pleased: "happy",
  warm: "affectionate",
  thrilled: "excited",
  eager: "enthusiastic",
  ecstatic: "elated",
  euphoric: "elated",
  mad: "angry",
  furious: "angry",
  annoyed: "frustrated",
  irritated: "frustrated",
  agitated: "frustrated",
  shocked: "surprised",
  astonished: "amazed",
  wonder: "amazed",
  worried: "anxious",
  nervous: "anxious",
  afraid: "scared",
  alarmed: "panicked",
  unsure: "hesitant",
  uncertain: "hesitant",
  doubtful: "skeptical",
  suspicious: "skeptical",
  thoughtful: "contemplative",
  focused: "determined",
  serious: "determined",
  playful: "flirtatious",
  teasing: "flirtatious",
  ironic: "sarcastic",
  cheeky: "sarcastic",
  thankful: "grateful",
  sorry: "apologetic",
  down: "sad",
  melancholic: "sad",
  gloomy: "sad",
  peaceful: "calm",
  serene: "calm",
  relaxed: "calm",
  sleepy: "tired"
};
/** Spellings of the non-verbal laugh marker, all mapped to "laughter". */
var NONVERBAL = { laughter: "laughter", laughs: "laughter", laugh: "laughter", laughing: "laughter" };
/**
 * Own-property lookup on a plain-object table.
 * A bare `table[key]` also finds inherited members, so a model-written tag such as
 * `[constructor]` would resolve to `Object` instead of "no such entry".
 * @returns the stored value, or undefined when `key` is not an own property.
 */
function emotionTableLookup(table, key) {
  return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : void 0;
}
/**
 * Map a raw emotion word to its canonical name.
 * @param raw tag text without brackets; surrounding whitespace and case are ignored.
 * @returns the canonical emotion name, or null when the word is neither canonical nor an alias.
 */
function normalizeEmotion(raw) {
  const k = raw.trim().toLowerCase();
  if (VALID.has(k)) return k;
  return emotionTableLookup(ALIASES, k) ?? null;
}
/**
 * Classify the text found inside a `[...]` tag.
 * @param raw tag text without brackets.
 * @returns `{ kind: "nonverbal", value }` for a laugh marker, `{ kind: "emotion", value }`
 *   for a known emotion or alias, or null for anything else.
 */
function resolveTag(raw) {
  const k = raw.trim().toLowerCase();
  const sound = emotionTableLookup(NONVERBAL, k);
  if (sound) return { kind: "nonverbal", value: sound };
  const e = normalizeEmotion(k);
  return e ? { kind: "emotion", value: e } : null;
}
/** A complete inline tag: `[` + a letter + up to 24 more letters/spaces + `]`. Global: reset lastIndex before use. */
var TAG_RE = /\[([a-zA-Z][a-zA-Z ]{0,24})\]/g;
/**
 * A trailing, still-open tag prefix at the end of a chunk. Bounded exactly like TAG_RE so that text
 * which can no longer become a tag (leading space, or more than 25 characters) is not withheld.
 */
var PARTIAL_RE = /\[(?:[a-zA-Z][a-zA-Z ]{0,24})?$/;
/** Render a resolved tag in transcript form: non-verbals stay bracketed, emotions become `<emotion value="…"/>`. */
var cartesiaTag = (t) => t.kind === "nonverbal" ? `[${t.value}]` : `<emotion value="${t.value}"/>`;
|
|
4756
|
+
/**
 * Incremental parser for inline `[tag]` markers in a streamed reply.
 *
 * Each call returns three views of the text that became available:
 *   - speech:  text for TTS, with each recognised tag rendered by cartesiaTag() in front of the
 *              next span that contains a letter or digit;
 *   - display: text for the on-screen echo (recognised tags kept verbatim only when `show` is true);
 *   - prose:   the text with every bracket tag removed.
 * Unrecognised tags are dropped from all three views and logged at debug level.
 */
var EmotionStream = class {
  /** @param show keep recognised `[tag]` markers in the `display` output. */
  constructor(show = true) {
    this.show = show;
  }
  show;
  /** Unprocessed tail: a possibly-incomplete `[tag` prefix carried over to the next feed(). */
  buf = "";
  /** Most recent recognised tag that has not yet been attached to any speech. */
  pending = null;
  /** Append a streamed chunk and return whatever can be emitted now. */
  feed(delta) {
    this.buf += delta;
    return this.drain(false);
  }
  /** End of stream: emit everything still buffered, including an unterminated tag prefix as plain text. */
  flush() {
    return this.drain(true);
  }
  /**
   * Parse the buffer into { speech, display, prose }.
   * @param final when false, a trailing open-tag prefix is held back for the next chunk.
   */
  drain(final) {
    let body = this.buf;
    this.buf = "";
    if (!final) {
      const partial = body.match(PARTIAL_RE);
      if (partial) {
        // Hold the open prefix back; it may complete into a tag on the next delta.
        this.buf = partial[0];
        body = body.slice(0, body.length - partial[0].length);
      }
    }
    const out = { speech: "", display: "", prose: "" };
    const sink = (s, d, p) => {
      out.speech += s;
      out.display += d;
      out.prose += p;
    };
    let last = 0;
    // TAG_RE is a shared global regex: always restart scanning from the beginning.
    TAG_RE.lastIndex = 0;
    for (let m = TAG_RE.exec(body); m; m = TAG_RE.exec(body)) {
      this.emit(body.slice(last, m.index), sink);
      const tag = resolveTag(m[1]);
      if (tag) {
        this.pending = tag;
        if (this.show) out.display += m[0];
      } else log9.debug(`dropping unknown emotion tag ${m[0]}`);
      last = m.index + m[0].length;
    }
    this.emit(body.slice(last), sink);
    // A non-verbal tag is a sound of its own and needs no following words. At end of stream no
    // further text can arrive to carry it, so emit it now instead of silently losing it.
    if (final && this.pending && this.pending.kind === "nonverbal") {
      out.speech += cartesiaTag(this.pending);
      this.pending = null;
    }
    return out;
  }
  /** Emit a prose span, flushing any pending tag onto its FRONT (only once real words appear). */
  emit(text, sink) {
    if (!text) return;
    let speech = text;
    if (this.pending && /[\p{L}\p{N}]/u.test(text)) {
      speech = cartesiaTag(this.pending) + text;
      this.pending = null;
    }
    // display and prose both receive the untagged text; only speech carries the rendered tag.
    sink(speech, text, text);
  }
};
|
|
4812
|
+
|
|
4651
4813
|
// src/voice/spokenSplitter.ts
|
|
4652
4814
|
var OPEN = "<spoken>";
|
|
4653
4815
|
var CLOSE = "</spoken>";
|
|
@@ -4721,7 +4883,7 @@ var SpokenSplitter = class {
|
|
|
4721
4883
|
};
|
|
4722
4884
|
|
|
4723
4885
|
// src/duplex.ts
|
|
4724
|
-
var
|
|
4886
|
+
var log10 = forComponent("DuplexAgent");
|
|
4725
4887
|
function describeCall(call) {
|
|
4726
4888
|
const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
|
|
4727
4889
|
const hint = v ? ` (${String(v).replace(/\s+/g, " ").trim().slice(0, 48)})` : "";
|
|
@@ -4760,6 +4922,9 @@ var DuplexAgentOptions = class {
|
|
|
4760
4922
|
/** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
|
|
4761
4923
|
* backchannels, impulsive first reactions before content (mimics real duplex conversation). */
|
|
4762
4924
|
voiceStyle = "neutral";
|
|
4925
|
+
/** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
|
|
4926
|
+
* TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
|
|
4927
|
+
emotionTags = false;
|
|
4763
4928
|
/** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
|
|
4764
4929
|
* (post-spawn would race the worker's first edits). */
|
|
4765
4930
|
onTaskStart;
|
|
@@ -4792,6 +4957,7 @@ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEAR
|
|
|
4792
4957
|
var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
|
|
4793
4958
|
var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
|
|
4794
4959
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
|
|
4960
|
+
var EMOTION_TAGS_GUIDANCE = `EMOTION: your voice is synthesized with emotion control. Prefix a sentence with an inline [emotion] tag, placed directly before the sentence it colors, to shape how it is spoken. Use it ONLY when the emotion genuinely fits the words (it amplifies real feeling, it cannot fake it) \u2014 do not tag every sentence; reserve it for moments that carry feeling, and vary which one you use. You may also drop [laughter] for a natural laugh. Available emotions: ${EMOTIONS.join(", ")}.`;
|
|
4795
4961
|
var DuplexAgent = class _DuplexAgent {
|
|
4796
4962
|
options;
|
|
4797
4963
|
voice;
|
|
@@ -4817,6 +4983,8 @@ var DuplexAgent = class _DuplexAgent {
|
|
|
4817
4983
|
// briefs dispatched this turn (detect identical re-dispatch)
|
|
4818
4984
|
spokeThisTurn = false;
|
|
4819
4985
|
// any non-empty text_delta streamed this turn
|
|
4986
|
+
heldThisTurn = false;
|
|
4987
|
+
// Hold called this turn → turn is INTENTIONALLY silent (suppress reflex text + no dead-air ack)
|
|
4820
4988
|
nudging = false;
|
|
4821
4989
|
// re-ack pass in flight: block ALL tools, prevent recursion
|
|
4822
4990
|
reflexBuf = "";
|
|
@@ -4861,7 +5029,7 @@ var DuplexAgent = class _DuplexAgent {
|
|
|
4861
5029
|
...new Set(workerToolNames.filter((n) => n.startsWith("mcp__")).map((n) => n.slice(5).split("__")[0]))
|
|
4862
5030
|
];
|
|
4863
5031
|
const workerMcp = mcpNames.length ? `, and it can use these MCP servers: ${[...new Set(mcpNames)].join(", ")}` + (mcpNames.some((n) => /browser/i.test(n)) ? ' \u2014 including driving a REAL browser (open tabs, navigate, click, screenshot), so answer "yes" if asked whether you can control/drive a browser and route an actual browse to Act' : "") : "";
|
|
4864
|
-
const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
|
|
5032
|
+
const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + (o.emotionTags ? "\n" + EMOTION_TAGS_GUIDANCE : "") + `
|
|
4865
5033
|
Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
4866
5034
|
const tools = [
|
|
4867
5035
|
...o.reflexOptions?.tools ?? [],
|
|
@@ -4879,13 +5047,14 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4879
5047
|
confirm: host.confirm ? (p, m) => host.confirm(p, m) : void 0,
|
|
4880
5048
|
notify: (ev) => {
|
|
4881
5049
|
if (ev?.kind === "text_delta" && typeof ev.message === "string") {
|
|
5050
|
+
if (this.heldThisTurn) return;
|
|
4882
5051
|
if (this.fabricationCut) return;
|
|
4883
5052
|
const msg = ev.message;
|
|
4884
5053
|
this.reflexBuf += msg;
|
|
4885
5054
|
const m = this.reflexBuf.match(RESERVED_EVENT_MARKER) ?? this.reflexBuf.match(RESERVED_EVENT_OPENER);
|
|
4886
5055
|
if (m) {
|
|
4887
5056
|
this.fabricationCut = true;
|
|
4888
|
-
|
|
5057
|
+
log10.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
|
|
4889
5058
|
const safe = this.reflexBuf.slice(this.reflexForwarded, m.index);
|
|
4890
5059
|
if (!safe) return;
|
|
4891
5060
|
if (safe.trim()) this.spokeThisTurn = true;
|
|
@@ -4948,6 +5117,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4948
5117
|
this.turnDispatched = false;
|
|
4949
5118
|
this.turnBriefs.clear();
|
|
4950
5119
|
this.spokeThisTurn = false;
|
|
5120
|
+
this.heldThisTurn = false;
|
|
4951
5121
|
this.reflexBuf = "";
|
|
4952
5122
|
this.reflexForwarded = 0;
|
|
4953
5123
|
this.fabricationCut = false;
|
|
@@ -4976,7 +5146,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4976
5146
|
* voice) and emits an empty `final`, so no text_delta ever streams. Both ship silence; both repair.
|
|
4977
5147
|
* Requires a host: without one there's no stream to detect speech on (and no one to speak to). */
|
|
4978
5148
|
get silentTurn() {
|
|
4979
|
-
return !!this.options.host && !this.spokeThisTurn;
|
|
5149
|
+
return !!this.options.host && !this.spokeThisTurn && !this.heldThisTurn;
|
|
4980
5150
|
}
|
|
4981
5151
|
/** A turn that voiced nothing is dead air. Re-prompt the reflex ONCE so the LLM itself voices a short
|
|
4982
5152
|
* line (no template). If it STILL says nothing, fall back to a minimal line so silence never ships.
|
|
@@ -4987,7 +5157,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4987
5157
|
try {
|
|
4988
5158
|
await this.voice.send(fallback ? "[reminder] You said nothing to the user this turn. Tell them, in ONE short spoken sentence, what just happened \u2014 no tools." : dispatched ? "[reminder] You dispatched a task but said nothing to the user. Say ONE short spoken acknowledgement now \u2014 no tools." : "[reminder] You said nothing to the user this turn. Give your ONE short spoken reply now \u2014 no tools.");
|
|
4989
5159
|
} catch (e) {
|
|
4990
|
-
|
|
5160
|
+
log10.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
|
|
4991
5161
|
} finally {
|
|
4992
5162
|
this.nudging = false;
|
|
4993
5163
|
}
|
|
@@ -5079,7 +5249,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
5079
5249
|
buildBrief(brief, tier = "act", deliver = true) {
|
|
5080
5250
|
const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
|
|
5081
5251
|
const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
|
|
5082
|
-
const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
|
|
5252
|
+
const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." + (this.options.emotionTags ? " Inside <spoken>, you may prefix a sentence with an inline [emotion] tag (e.g. [excited], [curious]) to color how it is voiced \u2014 only when it genuinely fits, and vary it; [laughter] gives a natural laugh." : "") : "";
|
|
5083
5253
|
return (recent ? `${brief}
|
|
5084
5254
|
|
|
5085
5255
|
## Recent conversation (for context)
|
|
@@ -5195,7 +5365,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5195
5365
|
this.notify("task_verify", `task ${id}: verifying`, { id });
|
|
5196
5366
|
const cres = await new Agent(checkerOpts).run(checkBrief);
|
|
5197
5367
|
if (cres.finishReason !== "stop") {
|
|
5198
|
-
|
|
5368
|
+
log10.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
|
|
5199
5369
|
this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
|
|
5200
5370
|
}
|
|
5201
5371
|
const sum = (a = 0, b = 0) => a + b;
|
|
@@ -5331,7 +5501,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5331
5501
|
rec.status = "done";
|
|
5332
5502
|
rec.result = res.text;
|
|
5333
5503
|
const incomplete = res.finishReason !== "stop";
|
|
5334
|
-
|
|
5504
|
+
log10.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
|
|
5335
5505
|
this.notify("task_done", `task ${id} (${rec.label}) completed`, {
|
|
5336
5506
|
id,
|
|
5337
5507
|
text: res.text,
|
|
@@ -5357,7 +5527,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5357
5527
|
this.dropAsk(rec.id);
|
|
5358
5528
|
rec.status = "error";
|
|
5359
5529
|
rec.result = msg;
|
|
5360
|
-
|
|
5530
|
+
log10.warn(`task ${rec.id} failed: ${msg}`);
|
|
5361
5531
|
this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
|
|
5362
5532
|
this.queueRevoice(this.integrationPrompt(rec, "error", msg, "error"), true);
|
|
5363
5533
|
}
|
|
@@ -5537,6 +5707,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5537
5707
|
}
|
|
5538
5708
|
},
|
|
5539
5709
|
run: async ({ filler }) => {
|
|
5710
|
+
this.heldThisTurn = true;
|
|
5540
5711
|
if (filler) this.notify("hold_filler", String(filler));
|
|
5541
5712
|
return "Holding \u2014 listening for the rest of the user's thought. Do not respond further this turn.";
|
|
5542
5713
|
}
|
|
@@ -5720,7 +5891,7 @@ init_logging();
|
|
|
5720
5891
|
|
|
5721
5892
|
// src/voice/engine.ts
|
|
5722
5893
|
init_logging();
|
|
5723
|
-
var
|
|
5894
|
+
var log11 = forComponent("VoiceEngine");
|
|
5724
5895
|
var now = () => performance.now();
|
|
5725
5896
|
var forSpeech = (t) => t.replace(/[*_`#]+/g, "").replace(/^[ \t]*[-•]\s+/gm, "").replace(/\s*[\u2013\u2014]\s*/g, ", ").replace(/[\u2010\u2011]/g, "-").replace(/\s*\|\s*/g, ", ").replace(/(\d)\s+%/g, "$1%").replace(/\.{3,}/g, ".");
|
|
5726
5897
|
var VoiceEngineOptions = class {
|
|
@@ -5788,6 +5959,11 @@ var VoiceEngineOptions = class {
|
|
|
5788
5959
|
* speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
|
|
5789
5960
|
* mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
|
|
5790
5961
|
overlapEnergyHold = false;
|
|
5962
|
+
/** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
|
|
5963
|
+
* tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
|
|
5964
|
+
emotions = true;
|
|
5965
|
+
/** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
|
|
5966
|
+
showEmotions = false;
|
|
5791
5967
|
};
|
|
5792
5968
|
var VoiceEngine = class _VoiceEngine {
|
|
5793
5969
|
options;
|
|
@@ -5833,6 +6009,9 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5833
6009
|
// Central speech queue (above the TTS context): complete worker utterances serialize into ONE
|
|
5834
6010
|
// playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
|
|
5835
6011
|
uttQueue = [];
|
|
6012
|
+
// Per-turn emotion-tag parser (reset on beginSpeech) — converts `[emotion]` → Cartesia inline tags
|
|
6013
|
+
// for TTS, tracks tag-free prose for echo discrimination, and surfaces display text for the screen.
|
|
6014
|
+
emo = null;
|
|
5836
6015
|
constructor(options) {
|
|
5837
6016
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
5838
6017
|
const o = this.options;
|
|
@@ -5850,7 +6029,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5850
6029
|
this.stt.onLevel = (rms) => this.handleLevel(rms);
|
|
5851
6030
|
await Promise.all([this.tts.connect(), this.stt.start()]);
|
|
5852
6031
|
this.setState("listening");
|
|
5853
|
-
|
|
6032
|
+
log11.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
|
|
5854
6033
|
}
|
|
5855
6034
|
get usingAec() {
|
|
5856
6035
|
return this.stt.usingAec;
|
|
@@ -5859,6 +6038,10 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5859
6038
|
setBargeIn(on) {
|
|
5860
6039
|
this.options.bargeIn = on;
|
|
5861
6040
|
}
|
|
6041
|
+
/** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
|
|
6042
|
+
setShowEmotions(on) {
|
|
6043
|
+
this.options.showEmotions = on;
|
|
6044
|
+
}
|
|
5862
6045
|
idleWaiters = [];
|
|
5863
6046
|
setState(s) {
|
|
5864
6047
|
if (this.state === s) return;
|
|
@@ -5890,6 +6073,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5890
6073
|
this.ctxOpen = true;
|
|
5891
6074
|
this.spokeDeltas = false;
|
|
5892
6075
|
this.reply = "";
|
|
6076
|
+
this.emo = this.options.emotions ? new EmotionStream(this.options.showEmotions) : null;
|
|
5893
6077
|
this.echoWords = new Set(this.words(this.prevReply));
|
|
5894
6078
|
this.tts.newContext();
|
|
5895
6079
|
if (ack && this.options.ackPhrase) {
|
|
@@ -5900,21 +6084,31 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5900
6084
|
if (!this.turnStartAt) this.turnStartAt = now();
|
|
5901
6085
|
this.setState("thinking");
|
|
5902
6086
|
}
|
|
6087
|
+
/** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
|
|
6088
|
+
* host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
|
|
5903
6089
|
speakDelta(text) {
|
|
5904
|
-
if (this.interrupted) return;
|
|
6090
|
+
if (this.interrupted) return "";
|
|
5905
6091
|
if (!this.speaking || !this.ctxOpen) this.beginSpeech();
|
|
5906
|
-
this.
|
|
6092
|
+
const { speech, display, prose } = this.emo ? this.emo.feed(text) : { speech: text, display: text, prose: text };
|
|
6093
|
+
this.reply += prose;
|
|
5907
6094
|
for (const w of this.words(this.reply)) this.echoWords.add(w);
|
|
5908
|
-
this.tts.speak(forSpeech(
|
|
5909
|
-
if (!this.spokeDeltas && this.turnStartAt)
|
|
6095
|
+
this.tts.speak(forSpeech(speech), true);
|
|
6096
|
+
if (!this.spokeDeltas && this.turnStartAt) log11.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
|
|
5910
6097
|
this.spokeDeltas = true;
|
|
5911
6098
|
this.setState("speaking");
|
|
6099
|
+
return display;
|
|
5912
6100
|
}
|
|
5913
6101
|
/** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
|
|
5914
6102
|
endSpeech() {
|
|
5915
6103
|
this.interrupted = false;
|
|
5916
6104
|
if (!this.speaking) return;
|
|
5917
6105
|
this.ctxOpen = false;
|
|
6106
|
+
if (this.emo) {
|
|
6107
|
+
const t = this.emo.flush();
|
|
6108
|
+
this.emo = null;
|
|
6109
|
+
if (t.prose) this.reply += t.prose;
|
|
6110
|
+
if (t.speech) this.tts.speak(forSpeech(t.speech), true);
|
|
6111
|
+
}
|
|
5918
6112
|
if (this.reply) this.prevReply = this.reply;
|
|
5919
6113
|
const settle = () => {
|
|
5920
6114
|
if (this.ctxOpen) {
|
|
@@ -5927,7 +6121,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5927
6121
|
}
|
|
5928
6122
|
this.drainTimer = null;
|
|
5929
6123
|
this.speaking = false;
|
|
5930
|
-
if (this.turnStartAt)
|
|
6124
|
+
if (this.turnStartAt) log11.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
|
|
5931
6125
|
this.echoUntil = now() + 2500;
|
|
5932
6126
|
if (!this.usingAec) this.stt.reset();
|
|
5933
6127
|
this.setState("listening");
|
|
@@ -6119,7 +6313,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
6119
6313
|
this.pendingUtt = this.mergeUtterance(this.pendingUtt, text);
|
|
6120
6314
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
6121
6315
|
if (this.options.incompleteMergeMs && this.looksIncomplete(this.pendingUtt)) {
|
|
6122
|
-
|
|
6316
|
+
log11.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
|
|
6123
6317
|
this.options.onHold();
|
|
6124
6318
|
if (this.options.holdFiller && !this.speaking) {
|
|
6125
6319
|
this.beginSpeech();
|
|
@@ -6218,7 +6412,7 @@ async function resolveAuth(auth) {
|
|
|
6218
6412
|
}
|
|
6219
6413
|
|
|
6220
6414
|
// src/voice/soniox.ts
|
|
6221
|
-
var
|
|
6415
|
+
var log12 = forComponent("SonioxSTT");
|
|
6222
6416
|
var now2 = () => performance.now();
|
|
6223
6417
|
var SonioxSTTOptions = class {
|
|
6224
6418
|
auth = "";
|
|
@@ -6287,9 +6481,9 @@ var SonioxSTT = class {
|
|
|
6287
6481
|
this.ws.onmessage = (ev) => this.handle(JSON.parse(String(ev.data)));
|
|
6288
6482
|
this.ws.onclose = (ev) => {
|
|
6289
6483
|
if (this.stopped) return;
|
|
6290
|
-
|
|
6484
|
+
log12.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
|
|
6291
6485
|
this.reset();
|
|
6292
|
-
this.connectWs().catch((e) =>
|
|
6486
|
+
this.connectWs().catch((e) => log12.error(`soniox reconnect failed: ${e.message}`));
|
|
6293
6487
|
};
|
|
6294
6488
|
}
|
|
6295
6489
|
async start() {
|
|
@@ -6299,7 +6493,7 @@ var SonioxSTT = class {
|
|
|
6299
6493
|
this.endpointTimer = setInterval(() => {
|
|
6300
6494
|
const combined = (this.finalText + this.partialText).trim();
|
|
6301
6495
|
if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
|
|
6302
|
-
if (this.firstTokenAt)
|
|
6496
|
+
if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
|
|
6303
6497
|
this.reset();
|
|
6304
6498
|
this.onUtterance(combined, now2());
|
|
6305
6499
|
}, 120);
|
|
@@ -6311,7 +6505,7 @@ var SonioxSTT = class {
|
|
|
6311
6505
|
if (this.stopped) return;
|
|
6312
6506
|
const ref = this.lastChunkAt || this.startedChunksAt;
|
|
6313
6507
|
if (now2() - ref > noAudioMs) {
|
|
6314
|
-
|
|
6508
|
+
log12.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
|
|
6315
6509
|
this.onFatal("microphone stopped delivering audio (try a different input device, e.g. AirPods, or check System Settings \u2192 Sound \u2192 Input)");
|
|
6316
6510
|
this.stop();
|
|
6317
6511
|
}
|
|
@@ -6331,7 +6525,7 @@ var SonioxSTT = class {
|
|
|
6331
6525
|
});
|
|
6332
6526
|
}
|
|
6333
6527
|
handle(m) {
|
|
6334
|
-
if (m.error_message) return
|
|
6528
|
+
if (m.error_message) return log12.error(`soniox: ${m.error_message}`);
|
|
6335
6529
|
let endpoint = false;
|
|
6336
6530
|
for (const t of m.tokens ?? []) {
|
|
6337
6531
|
if (t.text === "<end>") endpoint = true;
|
|
@@ -6347,7 +6541,7 @@ var SonioxSTT = class {
|
|
|
6347
6541
|
this.onPartial(combined);
|
|
6348
6542
|
if (endpoint && this.finalText.trim()) {
|
|
6349
6543
|
const utterance = this.finalText.trim();
|
|
6350
|
-
if (this.firstTokenAt)
|
|
6544
|
+
if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
|
|
6351
6545
|
this.reset();
|
|
6352
6546
|
this.onUtterance(utterance, now2());
|
|
6353
6547
|
}
|
|
@@ -6370,7 +6564,7 @@ var SonioxSTT = class {
|
|
|
6370
6564
|
|
|
6371
6565
|
// src/voice/cartesia.ts
|
|
6372
6566
|
init_logging();
|
|
6373
|
-
var
|
|
6567
|
+
var log13 = forComponent("CartesiaTTS");
|
|
6374
6568
|
var now3 = () => performance.now();
|
|
6375
6569
|
var CartesiaTTSOptions = class {
|
|
6376
6570
|
auth = "";
|
|
@@ -6420,9 +6614,9 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6420
6614
|
this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
|
|
6421
6615
|
});
|
|
6422
6616
|
this.ws.onclose = (ev) => {
|
|
6423
|
-
|
|
6617
|
+
log13.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
|
|
6424
6618
|
if (!this.closed) {
|
|
6425
|
-
this.connecting = this.doConnect().catch((e) =>
|
|
6619
|
+
this.connecting = this.doConnect().catch((e) => log13.error(`cartesia reconnect failed: ${e.message}`));
|
|
6426
6620
|
}
|
|
6427
6621
|
};
|
|
6428
6622
|
this.ws.onmessage = (ev) => {
|
|
@@ -6444,11 +6638,11 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6444
6638
|
this.down = true;
|
|
6445
6639
|
this.downAt = now3();
|
|
6446
6640
|
this.consecutiveOk = 0;
|
|
6447
|
-
|
|
6641
|
+
log13.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
|
|
6448
6642
|
this.onDone();
|
|
6449
6643
|
this.startProbe();
|
|
6450
6644
|
} else if (!this.down) {
|
|
6451
|
-
|
|
6645
|
+
log13.warn(`cartesia: ${JSON.stringify(m)}`);
|
|
6452
6646
|
}
|
|
6453
6647
|
}
|
|
6454
6648
|
};
|
|
@@ -6462,7 +6656,7 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6462
6656
|
this.consecutiveOk = 0;
|
|
6463
6657
|
this.stopProbe();
|
|
6464
6658
|
const downMs = this.downAt ? now3() - this.downAt : 0;
|
|
6465
|
-
(downMs < 2e3 ?
|
|
6659
|
+
(downMs < 2e3 ? log13.debug : log13.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
|
|
6466
6660
|
}
|
|
6467
6661
|
/** Ensure the WS is open before sending — reconnects if idle-closed. */
|
|
6468
6662
|
async ensureConnected() {
|