@livx.cc/agentx 0.97.7 → 0.98.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +360 -108
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +23 -1
- package/dist/index.js +234 -32
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.ts
CHANGED
|
@@ -858,6 +858,10 @@ interface TaskRecord {
|
|
|
858
858
|
/** Per-worker `<spoken>` splitter — the worker OWNS delivery: spoken segments stream during its run.
|
|
859
859
|
* Read at settle (spokeAny) to decide the no-spoken fallback. */
|
|
860
860
|
splitter?: SpokenSplitter;
|
|
861
|
+
/** Set when the user barged in / took the floor while this task was in flight: its remaining SPOKEN
|
|
862
|
+
* delivery is suppressed (don't talk over the new topic), but its full result still lands in the
|
|
863
|
+
* transcript so the reflex can surface it on request. See parkInFlightDeliveries(). */
|
|
864
|
+
deliveryParked?: boolean;
|
|
861
865
|
}
|
|
862
866
|
type WorkerTier = 'act' | 'think';
|
|
863
867
|
declare class DuplexAgentOptions {
|
|
@@ -890,6 +894,9 @@ declare class DuplexAgentOptions {
|
|
|
890
894
|
/** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
|
|
891
895
|
* backchannels, impulsive first reactions before content (mimics real duplex conversation). */
|
|
892
896
|
voiceStyle: 'neutral' | 'conversational';
|
|
897
|
+
/** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
|
|
898
|
+
* TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
|
|
899
|
+
emotionTags: boolean;
|
|
893
900
|
/** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
|
|
894
901
|
* (post-spawn would race the worker's first edits). */
|
|
895
902
|
onTaskStart?: (id: string, label: string) => void | Promise<void>;
|
|
@@ -998,6 +1005,11 @@ declare class DuplexAgent {
|
|
|
998
1005
|
send(content: MessageContent): Promise<RunResult>;
|
|
999
1006
|
/** Cancel a running background task — shared by the CancelTask tool and the CLI /tasks picker. */
|
|
1000
1007
|
cancelTask(id: string): string;
|
|
1008
|
+
/** Barge-in: the user took the floor while task(s) were running. Suppress those tasks' remaining SPOKEN
|
|
1009
|
+
* delivery so a superseded topic never talks over the new one (the debt-after-jokes regression). The
|
|
1010
|
+
* tasks keep running and still fold their result into the transcript — recoverable, just not spoken.
|
|
1011
|
+
* Returns the parked ids (for logging). Does NOT cancel: that's a deliberate reflex/user action. */
|
|
1012
|
+
parkInFlightDeliveries(): string[];
|
|
1001
1013
|
/** Resolve when all queued voice turns AND all in-flight worker tasks have settled (tests, graceful shutdown). */
|
|
1002
1014
|
idle(): Promise<void>;
|
|
1003
1015
|
/** Promise-chain mutex: turns run strictly one at a time; a failed turn doesn't poison the chain. */
|
|
@@ -1266,6 +1278,11 @@ declare class VoiceEngineOptions {
|
|
|
1266
1278
|
* speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
|
|
1267
1279
|
* mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
|
|
1268
1280
|
overlapEnergyHold: boolean;
|
|
1281
|
+
/** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
|
|
1282
|
+
* tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
|
|
1283
|
+
emotions: boolean;
|
|
1284
|
+
/** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
|
|
1285
|
+
showEmotions: boolean;
|
|
1269
1286
|
}
|
|
1270
1287
|
declare class VoiceEngine {
|
|
1271
1288
|
options: VoiceEngineOptions;
|
|
@@ -1296,11 +1313,14 @@ declare class VoiceEngine {
|
|
|
1296
1313
|
private resumeTimer;
|
|
1297
1314
|
private turnStartAt;
|
|
1298
1315
|
private uttQueue;
|
|
1316
|
+
private emo;
|
|
1299
1317
|
constructor(options?: Partial<VoiceEngineOptions>);
|
|
1300
1318
|
start(): Promise<void>;
|
|
1301
1319
|
get usingAec(): boolean;
|
|
1302
1320
|
/** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
|
|
1303
1321
|
setBargeIn(on: boolean): void;
|
|
1322
|
+
/** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up). */
|
|
1323
|
+
setShowEmotions(on: boolean): void;
|
|
1304
1324
|
private idleWaiters;
|
|
1305
1325
|
private setState;
|
|
1306
1326
|
/** Resolve when the engine is no longer speaking (immediate if already idle). */
|
|
@@ -1309,7 +1329,9 @@ declare class VoiceEngine {
|
|
|
1309
1329
|
* `ack` speaks the configured micro-ack as the context opener (utterance path only —
|
|
1310
1330
|
* masks LLM TTFT; re-voice turns begun by their first delta skip it). */
|
|
1311
1331
|
beginSpeech(ack?: boolean): void;
|
|
1312
|
-
|
|
1332
|
+
/** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
|
|
1333
|
+
* host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
|
|
1334
|
+
speakDelta(text: string): string;
|
|
1313
1335
|
/** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
|
|
1314
1336
|
endSpeech(): void;
|
|
1315
1337
|
/** text of the reply cut by the last barge-in — consumed by the host to tell the model what
|
package/dist/index.js
CHANGED
|
@@ -4648,6 +4648,168 @@ function digestRun(messages, maxChars) {
|
|
|
4648
4648
|
import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
|
|
4649
4649
|
init_logging();
|
|
4650
4650
|
|
|
4651
|
+
// src/voice/emotion.ts
|
|
4652
|
+
init_logging();
|
|
4653
|
+
var log9 = forComponent("Emotion");
|
|
4654
|
+
// Canonical emotion names: the vocabulary advertised to the model and accepted inside `[emotion]` tags.
var EMOTIONS = [
  // primary (best results)
  "neutral",
  "angry",
  "excited",
  "content",
  "sad",
  "scared",
  // extended
  "happy",
  "enthusiastic",
  "elated",
  "triumphant",
  "amazed",
  "surprised",
  "flirtatious",
  "curious",
  "calm",
  "grateful",
  "affectionate",
  "sympathetic",
  "mysterious",
  "frustrated",
  "disgusted",
  "sarcastic",
  "disappointed",
  "hurt",
  "guilty",
  "bored",
  "tired",
  "nostalgic",
  "apologetic",
  "hesitant",
  "confused",
  "anxious",
  "panicked",
  "proud",
  "confident",
  "skeptical",
  "contemplative",
  "determined"
];
// Membership set over the canonical names.
var VALID = new Set(EMOTIONS);
// Near-synonyms mapped onto a canonical emotion name.
var ALIASES = {
  cheerful: "happy",
  joyful: "happy",
  joy: "happy",
  glad: "happy",
  pleased: "happy",
  warm: "affectionate",
  thrilled: "excited",
  eager: "enthusiastic",
  ecstatic: "elated",
  euphoric: "elated",
  mad: "angry",
  furious: "angry",
  annoyed: "frustrated",
  irritated: "frustrated",
  agitated: "frustrated",
  shocked: "surprised",
  astonished: "amazed",
  wonder: "amazed",
  worried: "anxious",
  nervous: "anxious",
  afraid: "scared",
  alarmed: "panicked",
  unsure: "hesitant",
  uncertain: "hesitant",
  doubtful: "skeptical",
  suspicious: "skeptical",
  thoughtful: "contemplative",
  focused: "determined",
  serious: "determined",
  playful: "flirtatious",
  teasing: "flirtatious",
  ironic: "sarcastic",
  cheeky: "sarcastic",
  thankful: "grateful",
  sorry: "apologetic",
  down: "sad",
  melancholic: "sad",
  gloomy: "sad",
  peaceful: "calm",
  serene: "calm",
  relaxed: "calm",
  sleepy: "tired"
};
// Spelling variants of the non-verbal laughter cue, mapped to its canonical form.
var NONVERBAL = { laughter: "laughter", laughs: "laughter", laugh: "laughter", laughing: "laughter" };
/**
 * Map a raw tag body onto a canonical emotion name.
 * @param raw tag text without the brackets; trimmed and lower-cased before lookup
 * @returns the canonical name (directly or via ALIASES), or null when unrecognized
 */
function normalizeEmotion(raw) {
  const k = raw.trim().toLowerCase();
  if (VALID.has(k)) return k;
  // Own-property check: a bare `ALIASES[k]` also resolves inherited keys, so "constructor"
  // would come back as the Object function instead of null.
  return Object.prototype.hasOwnProperty.call(ALIASES, k) ? ALIASES[k] : null;
}
/**
 * Classify a raw tag body as a non-verbal cue or an emotion.
 * @param raw tag text without the brackets; trimmed and lower-cased before lookup
 * @returns `{ kind: "nonverbal" | "emotion", value }`, or null when the tag is unknown
 */
function resolveTag(raw) {
  const k = raw.trim().toLowerCase();
  // Same own-property guard as above: inherited Object.prototype members are not cues.
  if (Object.prototype.hasOwnProperty.call(NONVERBAL, k)) return { kind: "nonverbal", value: NONVERBAL[k] };
  const e = normalizeEmotion(k);
  return e ? { kind: "emotion", value: e } : null;
}
|
|
4753
|
+
// A complete inline tag: `[` + a letter + up to 24 more letters/spaces + `]`. Group 1 is the tag body.
// Global (stateful) regex: callers must reset `lastIndex` before scanning a new string.
var TAG_RE = /\[([a-zA-Z][a-zA-Z ]{0,24})\]/g;
// A tag that may still be completed by the next streamed delta: `[` at the very end, optionally
// followed by a prefix TAG_RE could accept (starts with a letter, at most 25 chars). Anything longer,
// or starting with a space, can never become a tag, so it is not held back from speech.
var PARTIAL_RE = /\[(?:[a-zA-Z][a-zA-Z ]{0,24})?$/;
/** Render a resolved tag for the TTS transcript: non-verbal cues stay bracketed, emotions become an `<emotion/>` element. */
var cartesiaTag = (t) => t.kind === "nonverbal" ? `[${t.value}]` : `<emotion value="${t.value}"/>`;
|
|
4756
|
+
/**
 * Incremental parser for inline `[tag]` markers in a streamed reply.
 *
 * Each call returns three views of the newly released text:
 *  - `speech`: text for TTS, with recognized tags rendered via `cartesiaTag`
 *  - `display`: text for the screen; recognized tags are kept verbatim only when `show` is true
 *  - `prose`: the text with all matched tags removed
 * Unrecognized tags are dropped from all three (and logged at debug level).
 */
var EmotionStream = class {
  /** @param show keep recognized tags in the `display` output */
  constructor(show = true) {
    this.show = show;
  }
  show;
  // Unreleased tail of the input: a possibly incomplete tag held back until more text arrives.
  buf = "";
  // Most recent recognized tag that has not yet been written into `speech`.
  pending = null;
  /** Append a delta and return whatever can be released now. */
  feed(delta) {
    this.buf += delta;
    return this.drain(false);
  }
  /** End of stream: release everything still buffered, including an unfinished tag as literal text. */
  flush() {
    return this.drain(true);
  }
  /**
   * Parse the buffered text.
   * @param final true at end of stream (nothing is held back, pending state is cleared)
   * @returns `{ speech, display, prose }` for the released span
   */
  drain(final) {
    let body = this.buf;
    if (!final) {
      // Hold back a trailing partial tag so it is not spoken before we know whether it completes.
      const p = body.match(PARTIAL_RE);
      if (p) {
        this.buf = p[0];
        body = body.slice(0, body.length - p[0].length);
      } else this.buf = "";
    } else this.buf = "";
    let speech = "", display = "", prose = "", last = 0;
    const sink = (s, d, p) => {
      speech += s;
      display += d;
      prose += p;
    };
    // TAG_RE is a shared global regex; reset its cursor before scanning this string.
    TAG_RE.lastIndex = 0;
    for (let m = TAG_RE.exec(body); m; m = TAG_RE.exec(body)) {
      this.emit(body.slice(last, m.index), sink);
      const tag = resolveTag(m[1]);
      if (tag) {
        // A non-verbal cue is audible content, not a modifier: voice it now rather than
        // letting the newer tag overwrite it.
        speech += this.takeNonverbal();
        this.pending = tag;
        if (this.show) display += m[0];
      } else log9.debug(`dropping unknown emotion tag ${m[0]}`);
      last = m.index + m[0].length;
    }
    this.emit(body.slice(last), sink);
    if (final) {
      // No more words will arrive: a trailing non-verbal cue is still voiced, while a trailing
      // emotion tag has nothing to color and is discarded.
      speech += this.takeNonverbal();
      this.pending = null;
    }
    return { speech, display, prose };
  }
  /** Consume a pending non-verbal cue and return its TTS form ("" when none is pending). */
  takeNonverbal() {
    const t = this.pending;
    if (!t || t.kind !== "nonverbal") return "";
    this.pending = null;
    return cartesiaTag(t);
  }
  /** Emit a prose span, flushing any pending tag onto its FRONT (only once real words appear). */
  emit(text, sink) {
    if (!text) return;
    let speech = text;
    // Whitespace/punctuation-only spans pass through without consuming the pending tag.
    if (this.pending && /[\p{L}\p{N}]/u.test(text)) {
      speech = cartesiaTag(this.pending) + text;
      this.pending = null;
    }
    sink(speech, text, text);
  }
};
|
|
4812
|
+
|
|
4651
4813
|
// src/voice/spokenSplitter.ts
|
|
4652
4814
|
var OPEN = "<spoken>";
|
|
4653
4815
|
var CLOSE = "</spoken>";
|
|
@@ -4721,7 +4883,7 @@ var SpokenSplitter = class {
|
|
|
4721
4883
|
};
|
|
4722
4884
|
|
|
4723
4885
|
// src/duplex.ts
|
|
4724
|
-
var
|
|
4886
|
+
var log10 = forComponent("DuplexAgent");
|
|
4725
4887
|
function describeCall(call) {
|
|
4726
4888
|
const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
|
|
4727
4889
|
const hint = v ? ` (${String(v).replace(/\s+/g, " ").trim().slice(0, 48)})` : "";
|
|
@@ -4760,6 +4922,9 @@ var DuplexAgentOptions = class {
|
|
|
4760
4922
|
/** Voice register: 'neutral' = clean spoken style; 'conversational' = human-like — fillers,
|
|
4761
4923
|
* backchannels, impulsive first reactions before content (mimics real duplex conversation). */
|
|
4762
4924
|
voiceStyle = "neutral";
|
|
4925
|
+
/** Teach the model to emit inline `[emotion]` tags for Cartesia emotion control. Only set when the
|
|
4926
|
+
* TTS actually speaks them — text-duplex (no TTS) would otherwise print literal tags. */
|
|
4927
|
+
emotionTags = false;
|
|
4763
4928
|
/** Awaited BEFORE a worker spawns — open a per-task checkpoint frame, audit, etc.
|
|
4764
4929
|
* (post-spawn would race the worker's first edits). */
|
|
4765
4930
|
onTaskStart;
|
|
@@ -4792,6 +4957,7 @@ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEAR
|
|
|
4792
4957
|
var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
|
|
4793
4958
|
var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
|
|
4794
4959
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
|
|
4960
|
+
var EMOTION_TAGS_GUIDANCE = `EMOTION: your voice is synthesized with emotion control. Prefix a sentence with an inline [emotion] tag, placed directly before the sentence it colors, to shape how it is spoken. Use it ONLY when the emotion genuinely fits the words (it amplifies real feeling, it cannot fake it) \u2014 do not tag every sentence; reserve it for moments that carry feeling, and vary which one you use. You may also drop [laughter] for a natural laugh. Available emotions: ${EMOTIONS.join(", ")}.`;
|
|
4795
4961
|
var DuplexAgent = class _DuplexAgent {
|
|
4796
4962
|
options;
|
|
4797
4963
|
voice;
|
|
@@ -4861,7 +5027,7 @@ var DuplexAgent = class _DuplexAgent {
|
|
|
4861
5027
|
...new Set(workerToolNames.filter((n) => n.startsWith("mcp__")).map((n) => n.slice(5).split("__")[0]))
|
|
4862
5028
|
];
|
|
4863
5029
|
const workerMcp = mcpNames.length ? `, and it can use these MCP servers: ${[...new Set(mcpNames)].join(", ")}` + (mcpNames.some((n) => /browser/i.test(n)) ? ' \u2014 including driving a REAL browser (open tabs, navigate, click, screenshot), so answer "yes" if asked whether you can control/drive a browser and route an actual browse to Act' : "") : "";
|
|
4864
|
-
const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
|
|
5030
|
+
const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot).replace("{{THINK_SLOT}}", thinkSlot).replace("{{WORKER_WEB}}", workerWeb + workerMcp) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + (o.emotionTags ? "\n" + EMOTION_TAGS_GUIDANCE : "") + `
|
|
4865
5031
|
Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
4866
5032
|
const tools = [
|
|
4867
5033
|
...o.reflexOptions?.tools ?? [],
|
|
@@ -4885,7 +5051,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4885
5051
|
const m = this.reflexBuf.match(RESERVED_EVENT_MARKER) ?? this.reflexBuf.match(RESERVED_EVENT_OPENER);
|
|
4886
5052
|
if (m) {
|
|
4887
5053
|
this.fabricationCut = true;
|
|
4888
|
-
|
|
5054
|
+
log10.warn(`reflex fabricated a [task \u2026] event in its spoken stream \u2014 cutting it (kept ${m.index} chars)`);
|
|
4889
5055
|
const safe = this.reflexBuf.slice(this.reflexForwarded, m.index);
|
|
4890
5056
|
if (!safe) return;
|
|
4891
5057
|
if (safe.trim()) this.spokeThisTurn = true;
|
|
@@ -4987,7 +5153,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4987
5153
|
try {
|
|
4988
5154
|
await this.voice.send(fallback ? "[reminder] You said nothing to the user this turn. Tell them, in ONE short spoken sentence, what just happened \u2014 no tools." : dispatched ? "[reminder] You dispatched a task but said nothing to the user. Say ONE short spoken acknowledgement now \u2014 no tools." : "[reminder] You said nothing to the user this turn. Give your ONE short spoken reply now \u2014 no tools.");
|
|
4989
5155
|
} catch (e) {
|
|
4990
|
-
|
|
5156
|
+
log10.warn(`ack nudge failed: ${e instanceof Error ? e.message : e}`);
|
|
4991
5157
|
} finally {
|
|
4992
5158
|
this.nudging = false;
|
|
4993
5159
|
}
|
|
@@ -5014,6 +5180,19 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
5014
5180
|
rec.controller.abort();
|
|
5015
5181
|
return `Task ${rec.id} (${rec.label}) cancelled.`;
|
|
5016
5182
|
}
|
|
5183
|
+
/** Barge-in: the user took the floor while task(s) were running. Suppress those tasks' remaining SPOKEN
|
|
5184
|
+
* delivery so a superseded topic never talks over the new one (the debt-after-jokes regression). The
|
|
5185
|
+
* tasks keep running and still fold their result into the transcript — recoverable, just not spoken.
|
|
5186
|
+
* Returns the parked ids (for logging). Does NOT cancel: that's a deliberate reflex/user action. */
|
|
5187
|
+
parkInFlightDeliveries() {
|
|
5188
|
+
const parked = [];
|
|
5189
|
+
for (const rec of this.tasks.values())
|
|
5190
|
+
if (rec.status === "running" && !rec.deliveryParked) {
|
|
5191
|
+
rec.deliveryParked = true;
|
|
5192
|
+
parked.push(rec.id);
|
|
5193
|
+
}
|
|
5194
|
+
return parked;
|
|
5195
|
+
}
|
|
5017
5196
|
/** Resolve when all queued voice turns AND all in-flight worker tasks have settled (tests, graceful shutdown). */
|
|
5018
5197
|
async idle() {
|
|
5019
5198
|
while (true) {
|
|
@@ -5066,7 +5245,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
5066
5245
|
buildBrief(brief, tier = "act", deliver = true) {
|
|
5067
5246
|
const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
|
|
5068
5247
|
const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
|
|
5069
|
-
const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
|
|
5248
|
+
const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." + (this.options.emotionTags ? " Inside <spoken>, you may prefix a sentence with an inline [emotion] tag (e.g. [excited], [curious]) to color how it is voiced \u2014 only when it genuinely fits, and vary it; [laughter] gives a natural laugh." : "") : "";
|
|
5070
5249
|
return (recent ? `${brief}
|
|
5071
5250
|
|
|
5072
5251
|
## Recent conversation (for context)
|
|
@@ -5111,7 +5290,7 @@ ${recent}` : brief) + verify + deliverContract;
|
|
|
5111
5290
|
};
|
|
5112
5291
|
const splitter = new SpokenSplitter();
|
|
5113
5292
|
const speak = (seg) => {
|
|
5114
|
-
if (seg) o.host?.notify?.({ kind: "speak_utterance", message: seg });
|
|
5293
|
+
if (seg && !this.tasks.get(id)?.deliveryParked) o.host?.notify?.({ kind: "speak_utterance", message: seg });
|
|
5115
5294
|
};
|
|
5116
5295
|
const coalescer = new SentenceCoalescer();
|
|
5117
5296
|
const feedSpoken = (s) => {
|
|
@@ -5182,7 +5361,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5182
5361
|
this.notify("task_verify", `task ${id}: verifying`, { id });
|
|
5183
5362
|
const cres = await new Agent(checkerOpts).run(checkBrief);
|
|
5184
5363
|
if (cres.finishReason !== "stop") {
|
|
5185
|
-
|
|
5364
|
+
log10.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
|
|
5186
5365
|
this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
|
|
5187
5366
|
}
|
|
5188
5367
|
const sum = (a = 0, b = 0) => a + b;
|
|
@@ -5318,7 +5497,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5318
5497
|
rec.status = "done";
|
|
5319
5498
|
rec.result = res.text;
|
|
5320
5499
|
const incomplete = res.finishReason !== "stop";
|
|
5321
|
-
|
|
5500
|
+
log10.verbose(`task ${id} done (${res.steps} steps${incomplete ? `, INCOMPLETE: ${res.finishReason}` : ""})`);
|
|
5322
5501
|
this.notify("task_done", `task ${id} (${rec.label}) completed`, {
|
|
5323
5502
|
id,
|
|
5324
5503
|
text: res.text,
|
|
@@ -5332,9 +5511,9 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5332
5511
|
return this.queueRevoice(this.integrationPrompt(rec, "incomplete", res.text, res.finishReason), true);
|
|
5333
5512
|
}
|
|
5334
5513
|
const tail = rec.splitter?.flush();
|
|
5335
|
-
if (tail?.spoken) this.options.host?.notify?.({ kind: "speak_utterance", message: tail.spoken });
|
|
5514
|
+
if (tail?.spoken && !rec.deliveryParked) this.options.host?.notify?.({ kind: "speak_utterance", message: tail.spoken });
|
|
5336
5515
|
if (res.text.trim()) this.voice.transcript.push({ role: "assistant", content: res.text });
|
|
5337
|
-
if (!rec.splitter?.spokeAny && res.text.trim())
|
|
5516
|
+
if (!rec.splitter?.spokeAny && res.text.trim() && !rec.deliveryParked)
|
|
5338
5517
|
this.options.host?.notify?.({ kind: "speak_utterance", message: res.text });
|
|
5339
5518
|
}
|
|
5340
5519
|
onWorkerFailed(id, err) {
|
|
@@ -5344,7 +5523,7 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5344
5523
|
this.dropAsk(rec.id);
|
|
5345
5524
|
rec.status = "error";
|
|
5346
5525
|
rec.result = msg;
|
|
5347
|
-
|
|
5526
|
+
log10.warn(`task ${rec.id} failed: ${msg}`);
|
|
5348
5527
|
this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
|
|
5349
5528
|
this.queueRevoice(this.integrationPrompt(rec, "error", msg, "error"), true);
|
|
5350
5529
|
}
|
|
@@ -5707,7 +5886,7 @@ init_logging();
|
|
|
5707
5886
|
|
|
5708
5887
|
// src/voice/engine.ts
|
|
5709
5888
|
init_logging();
|
|
5710
|
-
var
|
|
5889
|
+
var log11 = forComponent("VoiceEngine");
|
|
5711
5890
|
var now = () => performance.now();
|
|
5712
5891
|
var forSpeech = (t) => t.replace(/[*_`#]+/g, "").replace(/^[ \t]*[-•]\s+/gm, "").replace(/\s*[\u2013\u2014]\s*/g, ", ").replace(/[\u2010\u2011]/g, "-").replace(/\s*\|\s*/g, ", ").replace(/(\d)\s+%/g, "$1%").replace(/\.{3,}/g, ".");
|
|
5713
5892
|
var VoiceEngineOptions = class {
|
|
@@ -5775,6 +5954,11 @@ var VoiceEngineOptions = class {
|
|
|
5775
5954
|
* speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
|
|
5776
5955
|
* mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
|
|
5777
5956
|
overlapEnergyHold = false;
|
|
5957
|
+
/** Map inline `[emotion]` tags (emitted by the model, prompt-taught) into Cartesia inline emotion
|
|
5958
|
+
* tags in the spoken transcript (sonic-3 stitches the prosody). false = strip them silently. */
|
|
5959
|
+
emotions = true;
|
|
5960
|
+
/** Show the `[emotion]` tags in the on-screen echo (debug). false = hide (spoken-only). */
|
|
5961
|
+
showEmotions = false;
|
|
5778
5962
|
};
|
|
5779
5963
|
var VoiceEngine = class _VoiceEngine {
|
|
5780
5964
|
options;
|
|
@@ -5820,6 +6004,9 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5820
6004
|
// Central speech queue (above the TTS context): complete worker utterances serialize into ONE
|
|
5821
6005
|
// playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
|
|
5822
6006
|
uttQueue = [];
|
|
6007
|
+
// Per-turn emotion-tag parser (reset on beginSpeech) — converts `[emotion]` → Cartesia inline tags
|
|
6008
|
+
// for TTS, tracks tag-free prose for echo discrimination, and surfaces display text for the screen.
|
|
6009
|
+
emo = null;
|
|
5823
6010
|
constructor(options) {
|
|
5824
6011
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
5825
6012
|
const o = this.options;
|
|
@@ -5837,7 +6024,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5837
6024
|
this.stt.onLevel = (rms) => this.handleLevel(rms);
|
|
5838
6025
|
await Promise.all([this.tts.connect(), this.stt.start()]);
|
|
5839
6026
|
this.setState("listening");
|
|
5840
|
-
|
|
6027
|
+
log11.debug(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
|
|
5841
6028
|
}
|
|
5842
6029
|
get usingAec() {
|
|
5843
6030
|
return this.stt.usingAec;
|
|
@@ -5846,6 +6033,10 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5846
6033
|
setBargeIn(on) {
|
|
5847
6034
|
this.options.bargeIn = on;
|
|
5848
6035
|
}
|
|
6036
|
+
/** Show/hide the `[emotion]` debug tags in the echo (next turn's stream picks it up).
 * Only updates `options.showEmotions`: the per-turn EmotionStream copies the flag when it is
 * constructed in beginSpeech(), so a stream that already exists keeps its current setting.
 * @param on true to include recognized tags in the on-screen echo text */
|
|
6037
|
+
setShowEmotions(on) {
|
|
6038
|
+
this.options.showEmotions = on;
|
|
6039
|
+
}
|
|
5849
6040
|
idleWaiters = [];
|
|
5850
6041
|
setState(s) {
|
|
5851
6042
|
if (this.state === s) return;
|
|
@@ -5877,6 +6068,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5877
6068
|
this.ctxOpen = true;
|
|
5878
6069
|
this.spokeDeltas = false;
|
|
5879
6070
|
this.reply = "";
|
|
6071
|
+
this.emo = this.options.emotions ? new EmotionStream(this.options.showEmotions) : null;
|
|
5880
6072
|
this.echoWords = new Set(this.words(this.prevReply));
|
|
5881
6073
|
this.tts.newContext();
|
|
5882
6074
|
if (ack && this.options.ackPhrase) {
|
|
@@ -5887,21 +6079,31 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5887
6079
|
if (!this.turnStartAt) this.turnStartAt = now();
|
|
5888
6080
|
this.setState("thinking");
|
|
5889
6081
|
}
|
|
6082
|
+
/** Feed a spoken delta. Returns the on-screen echo text (emotion tags shown/hidden per config) so the
|
|
6083
|
+
* host renders the SAME stream that was parsed for TTS — no second, state-doubling parse. */
|
|
5890
6084
|
speakDelta(text) {
|
|
5891
|
-
if (this.interrupted) return;
|
|
6085
|
+
if (this.interrupted) return "";
|
|
5892
6086
|
if (!this.speaking || !this.ctxOpen) this.beginSpeech();
|
|
5893
|
-
this.
|
|
6087
|
+
const { speech, display, prose } = this.emo ? this.emo.feed(text) : { speech: text, display: text, prose: text };
|
|
6088
|
+
this.reply += prose;
|
|
5894
6089
|
for (const w of this.words(this.reply)) this.echoWords.add(w);
|
|
5895
|
-
this.tts.speak(forSpeech(
|
|
5896
|
-
if (!this.spokeDeltas && this.turnStartAt)
|
|
6090
|
+
this.tts.speak(forSpeech(speech), true);
|
|
6091
|
+
if (!this.spokeDeltas && this.turnStartAt) log11.debug(`ttft: ${Math.round(now() - this.turnStartAt)}ms`);
|
|
5897
6092
|
this.spokeDeltas = true;
|
|
5898
6093
|
this.setState("speaking");
|
|
6094
|
+
return display;
|
|
5899
6095
|
}
|
|
5900
6096
|
/** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
|
|
5901
6097
|
endSpeech() {
|
|
5902
6098
|
this.interrupted = false;
|
|
5903
6099
|
if (!this.speaking) return;
|
|
5904
6100
|
this.ctxOpen = false;
|
|
6101
|
+
if (this.emo) {
|
|
6102
|
+
const t = this.emo.flush();
|
|
6103
|
+
this.emo = null;
|
|
6104
|
+
if (t.prose) this.reply += t.prose;
|
|
6105
|
+
if (t.speech) this.tts.speak(forSpeech(t.speech), true);
|
|
6106
|
+
}
|
|
5905
6107
|
if (this.reply) this.prevReply = this.reply;
|
|
5906
6108
|
const settle = () => {
|
|
5907
6109
|
if (this.ctxOpen) {
|
|
@@ -5914,7 +6116,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5914
6116
|
}
|
|
5915
6117
|
this.drainTimer = null;
|
|
5916
6118
|
this.speaking = false;
|
|
5917
|
-
if (this.turnStartAt)
|
|
6119
|
+
if (this.turnStartAt) log11.debug(`turn: ${Math.round(now() - this.turnStartAt)}ms (incl. playback)`);
|
|
5918
6120
|
this.echoUntil = now() + 2500;
|
|
5919
6121
|
if (!this.usingAec) this.stt.reset();
|
|
5920
6122
|
this.setState("listening");
|
|
@@ -6106,7 +6308,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
6106
6308
|
this.pendingUtt = this.mergeUtterance(this.pendingUtt, text);
|
|
6107
6309
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
6108
6310
|
if (this.options.incompleteMergeMs && this.looksIncomplete(this.pendingUtt)) {
|
|
6109
|
-
|
|
6311
|
+
log11.verbose(`hold: incomplete utterance "${this.pendingUtt.slice(-40)}"`);
|
|
6110
6312
|
this.options.onHold();
|
|
6111
6313
|
if (this.options.holdFiller && !this.speaking) {
|
|
6112
6314
|
this.beginSpeech();
|
|
@@ -6205,7 +6407,7 @@ async function resolveAuth(auth) {
|
|
|
6205
6407
|
}
|
|
6206
6408
|
|
|
6207
6409
|
// src/voice/soniox.ts
|
|
6208
|
-
var
|
|
6410
|
+
var log12 = forComponent("SonioxSTT");
|
|
6209
6411
|
var now2 = () => performance.now();
|
|
6210
6412
|
var SonioxSTTOptions = class {
|
|
6211
6413
|
auth = "";
|
|
@@ -6274,9 +6476,9 @@ var SonioxSTT = class {
|
|
|
6274
6476
|
this.ws.onmessage = (ev) => this.handle(JSON.parse(String(ev.data)));
|
|
6275
6477
|
this.ws.onclose = (ev) => {
|
|
6276
6478
|
if (this.stopped) return;
|
|
6277
|
-
|
|
6479
|
+
log12.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
|
|
6278
6480
|
this.reset();
|
|
6279
|
-
this.connectWs().catch((e) =>
|
|
6481
|
+
this.connectWs().catch((e) => log12.error(`soniox reconnect failed: ${e.message}`));
|
|
6280
6482
|
};
|
|
6281
6483
|
}
|
|
6282
6484
|
async start() {
|
|
@@ -6286,7 +6488,7 @@ var SonioxSTT = class {
|
|
|
6286
6488
|
this.endpointTimer = setInterval(() => {
|
|
6287
6489
|
const combined = (this.finalText + this.partialText).trim();
|
|
6288
6490
|
if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
|
|
6289
|
-
if (this.firstTokenAt)
|
|
6491
|
+
if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192silence-endpoint, "${combined.slice(0, 60)}"`);
|
|
6290
6492
|
this.reset();
|
|
6291
6493
|
this.onUtterance(combined, now2());
|
|
6292
6494
|
}, 120);
|
|
@@ -6298,7 +6500,7 @@ var SonioxSTT = class {
|
|
|
6298
6500
|
if (this.stopped) return;
|
|
6299
6501
|
const ref = this.lastChunkAt || this.startedChunksAt;
|
|
6300
6502
|
if (now2() - ref > noAudioMs) {
|
|
6301
|
-
|
|
6503
|
+
log12.error(`stt: no mic audio for >${Math.round(noAudioMs / 1e3)}s \u2014 capture device stopped delivering`);
|
|
6302
6504
|
this.onFatal("microphone stopped delivering audio (try a different input device, e.g. AirPods, or check System Settings \u2192 Sound \u2192 Input)");
|
|
6303
6505
|
this.stop();
|
|
6304
6506
|
}
|
|
@@ -6318,7 +6520,7 @@ var SonioxSTT = class {
|
|
|
6318
6520
|
});
|
|
6319
6521
|
}
|
|
6320
6522
|
handle(m) {
|
|
6321
|
-
if (m.error_message) return
|
|
6523
|
+
if (m.error_message) return log12.error(`soniox: ${m.error_message}`);
|
|
6322
6524
|
let endpoint = false;
|
|
6323
6525
|
for (const t of m.tokens ?? []) {
|
|
6324
6526
|
if (t.text === "<end>") endpoint = true;
|
|
@@ -6334,7 +6536,7 @@ var SonioxSTT = class {
|
|
|
6334
6536
|
this.onPartial(combined);
|
|
6335
6537
|
if (endpoint && this.finalText.trim()) {
|
|
6336
6538
|
const utterance = this.finalText.trim();
|
|
6337
|
-
if (this.firstTokenAt)
|
|
6539
|
+
if (this.firstTokenAt) log12.debug(`stt: ${Math.round(now2() - this.firstTokenAt)}ms first-token\u2192endpoint, "${utterance.slice(0, 60)}"`);
|
|
6338
6540
|
this.reset();
|
|
6339
6541
|
this.onUtterance(utterance, now2());
|
|
6340
6542
|
}
|
|
@@ -6357,7 +6559,7 @@ var SonioxSTT = class {
|
|
|
6357
6559
|
|
|
6358
6560
|
// src/voice/cartesia.ts
|
|
6359
6561
|
init_logging();
|
|
6360
|
-
var
|
|
6562
|
+
var log13 = forComponent("CartesiaTTS");
|
|
6361
6563
|
var now3 = () => performance.now();
|
|
6362
6564
|
var CartesiaTTSOptions = class {
|
|
6363
6565
|
auth = "";
|
|
@@ -6407,9 +6609,9 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6407
6609
|
this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
|
|
6408
6610
|
});
|
|
6409
6611
|
this.ws.onclose = (ev) => {
|
|
6410
|
-
|
|
6612
|
+
log13.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
|
|
6411
6613
|
if (!this.closed) {
|
|
6412
|
-
this.connecting = this.doConnect().catch((e) =>
|
|
6614
|
+
this.connecting = this.doConnect().catch((e) => log13.error(`cartesia reconnect failed: ${e.message}`));
|
|
6413
6615
|
}
|
|
6414
6616
|
};
|
|
6415
6617
|
this.ws.onmessage = (ev) => {
|
|
@@ -6431,11 +6633,11 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6431
6633
|
this.down = true;
|
|
6432
6634
|
this.downAt = now3();
|
|
6433
6635
|
this.consecutiveOk = 0;
|
|
6434
|
-
|
|
6636
|
+
log13.warn(`TTS circuit breaker open \u2014 ${this.consecutiveErrors} consecutive errors, switching to text-only`);
|
|
6435
6637
|
this.onDone();
|
|
6436
6638
|
this.startProbe();
|
|
6437
6639
|
} else if (!this.down) {
|
|
6438
|
-
|
|
6640
|
+
log13.warn(`cartesia: ${JSON.stringify(m)}`);
|
|
6439
6641
|
}
|
|
6440
6642
|
}
|
|
6441
6643
|
};
|
|
@@ -6449,7 +6651,7 @@ var CartesiaTTS = class _CartesiaTTS {
|
|
|
6449
6651
|
this.consecutiveOk = 0;
|
|
6450
6652
|
this.stopProbe();
|
|
6451
6653
|
const downMs = this.downAt ? now3() - this.downAt : 0;
|
|
6452
|
-
(downMs < 2e3 ?
|
|
6654
|
+
(downMs < 2e3 ? log13.debug : log13.info)(`TTS recovered${downMs ? ` (down ${downMs}ms)` : ""}`);
|
|
6453
6655
|
}
|
|
6454
6656
|
/** Ensure the WS is open before sending — reconnects if idle-closed. */
|
|
6455
6657
|
async ensureConnected() {
|