@livx.cc/agentx 0.96.17 → 0.97.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{Agent-DdhD1pGw.d.ts → Agent-B_JD31Zx.d.ts} +1 -1
- package/dist/cli.d.ts +2 -2
- package/dist/cli.js +166 -21
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +50 -11
- package/dist/index.js +134 -18
- package/dist/index.js.map +1 -1
- package/dist/{mcp-CnzmQ8JE.d.ts → mcp-BZcizHav.d.ts} +1 -1
- package/dist/mcp.client.d.ts +2 -2
- package/dist/{tools-DtpN8Agv.d.ts → tools-DmrqMJcI.d.ts} +3 -0
- package/dist/tools.shell.d.ts +1 -1
- package/dist/tools.shell.js.map +1 -1
- package/package.json +2 -1
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { IFilesystem } from '@livx.cc/wcli/core';
|
|
2
|
-
import { M as Message, H as HostBridge, A as AgentTool, C as ChatLike, e as MessageContent } from './tools-
|
|
2
|
+
import { M as Message, H as HostBridge, A as AgentTool, C as ChatLike, e as MessageContent } from './tools-DmrqMJcI.js';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Hooks — deterministic interception points around tool execution, run by the
|
package/dist/cli.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env bun
|
|
2
|
-
import { H as Hooks, h as RunResult, R as ReasoningEffort, A as Agent } from './Agent-
|
|
2
|
+
import { H as Hooks, h as RunResult, R as ReasoningEffort, A as Agent } from './Agent-B_JD31Zx.js';
|
|
3
3
|
import { IFilesystem } from '@livx.cc/wcli/core';
|
|
4
|
-
import { M as Message, c as ContentPart, e as MessageContent } from './tools-
|
|
4
|
+
import { M as Message, c as ContentPart, e as MessageContent } from './tools-DmrqMJcI.js';
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* On-disk session store for the CLI: each conversation is one JSON file at
|
package/dist/cli.js
CHANGED
|
@@ -4581,6 +4581,57 @@ function digestRun(messages, maxChars) {
|
|
|
4581
4581
|
// src/duplex.ts
|
|
4582
4582
|
import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
|
|
4583
4583
|
init_logging();
|
|
4584
|
+
|
|
4585
|
+
// src/voice/spokenSplitter.ts
|
|
4586
|
+
var OPEN = "<spoken>";
var CLOSE = "</spoken>";
/**
 * Incremental splitter for a streamed text channel that mixes written detail with
 * `<spoken>…</spoken>` segments.
 *
 * Text between OPEN and CLOSE is returned as `spoken`; everything else is returned as
 * `detail`. Tags may be split across deltas, so a trailing fragment that could still
 * grow into the tag currently being looked for is held back in `buf` until a later
 * delta (or `flush`) resolves it.
 */
var SpokenSplitter = class {
  /** Unconsumed input: empty, or a held-back partial tag such as `<spo`. */
  buf = "";
  /** True while the read position is between an OPEN tag and its CLOSE tag. */
  inSpoken = false;
  /** True once any non-whitespace spoken text has ever been emitted (drives the no-spoken fallback). */
  spokeAny = false;
  /**
   * Feed a delta.
   * @param {string} delta - next chunk of streamed text.
   * @returns {{spoken: string, detail: string}} spans completed by this chunk (either may be '').
   */
  feed(delta) {
    this.buf += delta;
    return this.drain(false);
  }
  /**
   * Drain any buffered partial at end of stream. A trailing `<…` that never completed a
   * tag is emitted as detail, never as spoken text.
   * @returns {{spoken: string, detail: string}}
   */
  flush() {
    return this.drain(true);
  }
  /**
   * Consume as much of `buf` as can be classified.
   * @param {boolean} final - when true, nothing may stay buffered afterwards.
   * @returns {{spoken: string, detail: string}}
   */
  drain(final) {
    let spoken = "";
    let detail = "";
    while (this.buf.length) {
      // Only the tag that would toggle the current mode is meaningful here.
      const tag = this.inSpoken ? CLOSE : OPEN;
      const idx = this.buf.indexOf(tag);
      if (idx >= 0) {
        const text2 = this.buf.slice(0, idx);
        if (this.inSpoken) spoken += text2;
        else detail += text2;
        this.buf = this.buf.slice(idx + tag.length);
        this.inSpoken = !this.inSpoken;
        continue;
      }
      // No complete tag. Each tag contains exactly one "<" (its first character), so only
      // the LAST "<" in the buffer can begin a still-incomplete tag.
      const lt = this.buf.lastIndexOf("<");
      const holdStart = lt >= 0 && tag.startsWith(this.buf.slice(lt)) ? lt : this.buf.length;
      const text = this.buf.slice(0, holdStart);
      if (this.inSpoken) spoken += text;
      else detail += text;
      this.buf = this.buf.slice(holdStart);
      break;
    }
    if (final && this.buf) {
      // Whatever is left is an unfinished tag fragment (e.g. "</spo"). It is markup residue,
      // not content, so it goes to detail in BOTH modes — it must never reach the spoken channel.
      detail += this.buf;
      this.buf = "";
    }
    if (spoken.trim()) this.spokeAny = true;
    return { spoken, detail };
  }
};
|
|
4633
|
+
|
|
4634
|
+
// src/duplex.ts
|
|
4584
4635
|
var log8 = forComponent("DuplexAgent");
|
|
4585
4636
|
function describeCall(call) {
|
|
4586
4637
|
const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
|
|
@@ -4648,7 +4699,7 @@ var DuplexAgentOptions = class {
|
|
|
4648
4699
|
};
|
|
4649
4700
|
var RESERVED_EVENT_MARKER = /\[task\b[^\]\n]*\b(?:completed|failed|progress|asks)\b/i;
|
|
4650
4701
|
var RESERVED_EVENT_OPENER = /\[\s*task\b/i;
|
|
4651
|
-
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\
|
|
4702
|
+
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nA completed task speaks its OWN result to the user (the worker voices what matters as it finishes) \u2014 you do NOT re-voice clean task results. A FAILED or INCOMPLETE task still arrives as a "[task t1 failed] \u2026" event for you to handle. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. 
Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo NOT end every turn with the same canned offer ("want a rundown?", "want the steps?"). Offer once at most; if the user pushes back, repeats themselves, or sounds unsatisfied ("you know what I mean?", "think deeper", "are you sure?"), do NOT re-offer the same thing \u2014 change approach: dispatch `Act`/`Think` to actually dig in, or ask one concrete clarifying question. Repeating a non-answer is worse than silence.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. 
Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. 
Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled, surprising, or only half-parses, do NOT guess an action or improvise content from it \u2014 briefly confirm what they meant ("did you mean\u2026?") and wait. A one-line confirm beats a confident wrong answer or an invented response to a request you did not actually understand.';
|
|
4652
4703
|
var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
|
|
4653
4704
|
var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
|
|
4654
4705
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
|
|
@@ -4923,13 +4974,14 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
|
|
|
4923
4974
|
* Act briefs get a self-verify footer — the worker's report is trusted without review, so it
|
|
4924
4975
|
* must check its own work before reporting (nearly free under prompt caching; measured honest:
|
|
4925
4976
|
* it does NOT fix one-shot logic bugs — see mind/10). Think tasks are pure reasoning — no footer. */
|
|
4926
|
-
buildBrief(brief, tier = "act") {
|
|
4977
|
+
buildBrief(brief, tier = "act", deliver = true) {
|
|
4927
4978
|
const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
|
|
4928
4979
|
const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
|
|
4980
|
+
const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
|
|
4929
4981
|
return (recent ? `${brief}
|
|
4930
4982
|
|
|
4931
4983
|
## Recent conversation (for context)
|
|
4932
|
-
${recent}` : brief) + verify;
|
|
4984
|
+
${recent}` : brief) + verify + deliverContract;
|
|
4933
4985
|
}
|
|
4934
4986
|
/** Spawn a detached worker for task `id`; its settlement notifies + enqueues the re-voice turn. */
|
|
4935
4987
|
spawnWorker(id, label, briefText, tier, brief, followUp) {
|
|
@@ -4968,7 +5020,22 @@ ${recent}` : brief) + verify;
|
|
|
4968
5020
|
const a = await this.parkQuestion(id, `${q2.question}${opts}`);
|
|
4969
5021
|
return a || "(no answer from the user \u2014 use your best judgment and note the assumption)";
|
|
4970
5022
|
};
|
|
4971
|
-
const
|
|
5023
|
+
const splitter = new SpokenSplitter();
|
|
5024
|
+
const speak = (seg) => {
|
|
5025
|
+
if (seg) o.host?.notify?.({ kind: "speak_utterance", message: seg });
|
|
5026
|
+
};
|
|
5027
|
+
const askBridge = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : {};
|
|
5028
|
+
const workerHost = {
|
|
5029
|
+
...askBridge,
|
|
5030
|
+
notify: (ev) => {
|
|
5031
|
+
if (ev?.kind === "text_delta" && typeof ev.message === "string") {
|
|
5032
|
+
const { spoken, detail } = splitter.feed(ev.message);
|
|
5033
|
+
speak(spoken);
|
|
5034
|
+
if (detail.trim()) pushTail(detail.trim());
|
|
5035
|
+
return;
|
|
5036
|
+
}
|
|
5037
|
+
}
|
|
5038
|
+
};
|
|
4972
5039
|
const agentOpts = {
|
|
4973
5040
|
ai: o.ai,
|
|
4974
5041
|
fs: o.fs,
|
|
@@ -4978,13 +5045,16 @@ ${recent}` : brief) + verify;
|
|
|
4978
5045
|
// Recompute providerOptions for THIS worker's model (after tierOpts so it wins over any inherited
|
|
4979
5046
|
// main-template value) — prevents cursor-only cwd/cursorSession leaking onto an anthropic worker.
|
|
4980
5047
|
providerOptions: o.providerOptionsFor?.(tierModel),
|
|
4981
|
-
|
|
5048
|
+
stream: true,
|
|
5049
|
+
// worker streams text_delta so the splitter can extract <spoken> live (after tierOpts: never overridden off)
|
|
5050
|
+
host: workerHost,
|
|
5051
|
+
// carries BOTH ask AND the <spoken>-splitting notify
|
|
4982
5052
|
...hooks ? { hooks } : {},
|
|
4983
5053
|
signal: controller.signal
|
|
4984
5054
|
// shared with the checker so a cancel tears down both
|
|
4985
5055
|
};
|
|
4986
|
-
const promise = new Agent(agentOpts).run(briefText).then((res) => this.maybeVerify(id,
|
|
4987
|
-
this.tasks.set(id, { id, label, status: "running", controller, promise, tail, brief, followUp });
|
|
5056
|
+
const promise = new Agent(agentOpts).run(briefText).then((res) => this.maybeVerify(id, brief, res, tier, agentOpts, askBridge)).then((res) => this.onWorkerSettled(id, res)).catch((err2) => this.onWorkerFailed(id, err2));
|
|
5057
|
+
this.tasks.set(id, { id, label, status: "running", controller, promise, tail, brief, followUp, splitter });
|
|
4988
5058
|
if (this.tasks.size > this.options.maxTaskRecords)
|
|
4989
5059
|
for (const [tid, rec] of this.tasks) {
|
|
4990
5060
|
if (this.tasks.size <= this.options.maxTaskRecords) break;
|
|
@@ -4996,15 +5066,20 @@ ${recent}` : brief) + verify;
|
|
|
4996
5066
|
* on the shared fs automatically (workers write fs directly, no overlay), so grading sees the
|
|
4997
5067
|
* corrected state. Bounded to ONE pass. Off unless `verifyActTasks`; never runs for think/failed/
|
|
4998
5068
|
* cancelled tasks. Usage is merged so /cost reflects the real (worker + checker) spend. */
|
|
4999
|
-
async maybeVerify(id,
|
|
5069
|
+
async maybeVerify(id, brief, res, tier, agentOpts, askBridge) {
|
|
5000
5070
|
if (!this.options.verifyActTasks || tier !== "act" || res.finishReason !== "stop") return res;
|
|
5001
5071
|
if (this.tasks.get(id)?.status === "cancelled") return res;
|
|
5002
|
-
const
|
|
5072
|
+
const { stream: _stream, host: _host, ...restOpts } = agentOpts;
|
|
5073
|
+
const checkerOpts = {
|
|
5074
|
+
...restOpts,
|
|
5075
|
+
...askBridge.ask ? { host: { ask: askBridge.ask } } : {}
|
|
5076
|
+
};
|
|
5077
|
+
const checkBrief = `${this.buildBrief(brief, tier, false)}
|
|
5003
5078
|
|
|
5004
5079
|
## VERIFY MODE
|
|
5005
5080
|
Another agent just implemented the above. Independently check the CURRENT state of the files against EVERY requirement. Fix any gap you find. If everything is already correct, make NO changes \u2014 do not refactor or improve \u2014 and report "verified".`;
|
|
5006
5081
|
this.notify("task_verify", `task ${id}: verifying`, { id });
|
|
5007
|
-
const cres = await new Agent(
|
|
5082
|
+
const cres = await new Agent(checkerOpts).run(checkBrief);
|
|
5008
5083
|
if (cres.finishReason !== "stop") {
|
|
5009
5084
|
log8.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
|
|
5010
5085
|
this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
|
|
@@ -5095,12 +5170,14 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5095
5170
|
dropAsk(id) {
|
|
5096
5171
|
this.pendingAsks.get(id)?.resolve("");
|
|
5097
5172
|
}
|
|
5098
|
-
/** Build the INTEGRATION TURN prompt for a settled worker
|
|
5099
|
-
*
|
|
5100
|
-
*
|
|
5173
|
+
/** Build the INTEGRATION TURN prompt for a NON-CLEAN settled worker (early stop / failure). A clean
|
|
5174
|
+
* success never reaches here — it streams its own `<spoken>` delivery during the run. For a partial
|
|
5175
|
+
* or failed result the outcome re-enters the reflex as a decision (like a tool_result flowing back
|
|
5176
|
+
* into a normal agent loop): the reflex evaluates the outcome against the original intent and chooses
|
|
5177
|
+
* what to do next.
|
|
5101
5178
|
*
|
|
5102
5179
|
* Decision branches (the reflex acts on them with EXISTING tools — no new surface):
|
|
5103
|
-
* • accept →
|
|
5180
|
+
* • accept → SPEAK the (partial) result plainly — don't dress a failure up as success.
|
|
5104
5181
|
* • escalate → call `Think` with the SAME brief — only when Act failed/stalled AND a Think tier
|
|
5105
5182
|
* exists AND this task wasn't already a follow-up (one hop max). Wires the dead
|
|
5106
5183
|
* "Reserve Think for a problem Act already FAILED at" promise.
|
|
@@ -5111,8 +5188,6 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5111
5188
|
* failed-revoice fallback still fire, and the per-event transcript markers stay intact. */
|
|
5112
5189
|
integrationPrompt(rec, outcome, body, finishReason) {
|
|
5113
5190
|
const opener = outcome === "error" ? `[task ${rec.id} failed]` : `[task ${rec.id} completed]`;
|
|
5114
|
-
if (outcome === "ok")
|
|
5115
|
-
return `${opener} ${body}`;
|
|
5116
5191
|
const underCap = this.autoEscalations < _DuplexAgent.MAX_AUTO_ESCALATIONS;
|
|
5117
5192
|
const canEscalate = (outcome === "error" || outcome === "incomplete") && underCap;
|
|
5118
5193
|
const hasThink = this.options.thinkModel !== false;
|
|
@@ -5152,7 +5227,14 @@ Another agent just implemented the above. Independently check the CURRENT state
|
|
|
5152
5227
|
steps: res.steps,
|
|
5153
5228
|
toolCalls: res.messages.filter((m) => m.role === "tool").length
|
|
5154
5229
|
});
|
|
5155
|
-
|
|
5230
|
+
if (incomplete) {
|
|
5231
|
+
return this.queueRevoice(this.integrationPrompt(rec, "incomplete", res.text, res.finishReason), true);
|
|
5232
|
+
}
|
|
5233
|
+
const tail = rec.splitter?.flush();
|
|
5234
|
+
if (tail?.spoken) this.options.host?.notify?.({ kind: "speak_utterance", message: tail.spoken });
|
|
5235
|
+
if (res.text.trim()) this.voice.transcript.push({ role: "assistant", content: res.text });
|
|
5236
|
+
if (!rec.splitter?.spokeAny && res.text.trim())
|
|
5237
|
+
this.options.host?.notify?.({ kind: "speak_utterance", message: res.text });
|
|
5156
5238
|
}
|
|
5157
5239
|
onWorkerFailed(id, err2) {
|
|
5158
5240
|
this.failTask(this.tasks.get(id), err2 instanceof Error ? err2.message : String(err2));
|
|
@@ -5500,6 +5582,11 @@ var VoiceEngineOptions = class {
|
|
|
5500
5582
|
* as a barge and abort the fresh turn (live: mid-sentence self-interruption + steps=1→steps=0 double
|
|
5501
5583
|
* abort). Short enough that a genuine immediate barge ("no wait—") still lands right after. */
|
|
5502
5584
|
bargeGraceMs = 600;
|
|
5585
|
+
/** Barge-in (talk over the assistant to interrupt). true = full-duplex (needs echo cancellation, or
|
|
5586
|
+
* the assistant's own TTS bleeds back and self-interrupts). false = HALF-DUPLEX: the engine is deaf
|
|
5587
|
+
* while audible (speaking + drain tail), so echo can never become a phantom turn — the right mode
|
|
5588
|
+
* when there's no AEC (e.g. the non-VPIO mic fallback) and no headphones. Cost: can't interrupt. */
|
|
5589
|
+
bargeIn = true;
|
|
5503
5590
|
/** Filler phrase spoken when holding for an incomplete utterance ('' disables). */
|
|
5504
5591
|
holdFiller = "";
|
|
5505
5592
|
/** Called when the engine holds an incomplete utterance (host can render a visual cue). */
|
|
@@ -5571,6 +5658,9 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5571
5658
|
resumeTimer = null;
|
|
5572
5659
|
turnStartAt = 0;
|
|
5573
5660
|
// timestamp when the current turn began (for TTFT logging)
|
|
5661
|
+
// Central speech queue (above the TTS context): complete worker utterances serialize into ONE
|
|
5662
|
+
// playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
|
|
5663
|
+
uttQueue = [];
|
|
5574
5664
|
constructor(options) {
|
|
5575
5665
|
this.options = { ...new VoiceEngineOptions(), ...options };
|
|
5576
5666
|
const o = this.options;
|
|
@@ -5593,6 +5683,10 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5593
5683
|
get usingAec() {
|
|
5594
5684
|
return this.stt.usingAec;
|
|
5595
5685
|
}
|
|
5686
|
+
/** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
|
|
5687
|
+
setBargeIn(on) {
|
|
5688
|
+
this.options.bargeIn = on;
|
|
5689
|
+
}
|
|
5596
5690
|
idleWaiters = [];
|
|
5597
5691
|
setState(s) {
|
|
5598
5692
|
if (this.state === s) return;
|
|
@@ -5665,6 +5759,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5665
5759
|
this.echoUntil = now() + 2500;
|
|
5666
5760
|
if (!this.usingAec) this.stt.reset();
|
|
5667
5761
|
this.setState("listening");
|
|
5762
|
+
if (this.uttQueue.length) this.pumpQueue();
|
|
5668
5763
|
};
|
|
5669
5764
|
const drainThenSettle = () => {
|
|
5670
5765
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
@@ -5691,8 +5786,27 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5691
5786
|
this.speakDelta(text);
|
|
5692
5787
|
this.endSpeech();
|
|
5693
5788
|
}
|
|
5789
|
+
/** Enqueue a COMPLETE worker utterance (already-split spoken text) onto the central speech queue.
|
|
5790
|
+
* If nothing is currently speaking it plays immediately; otherwise it queues and plays after the
|
|
5791
|
+
* current utterance fully ends (settle → pumpQueue) — never spliced into an open reflex utterance. */
|
|
5792
|
+
enqueueUtterance(text) {
|
|
5793
|
+
if (!text || !text.trim()) return;
|
|
5794
|
+
this.uttQueue.push(text);
|
|
5795
|
+
if (!this.speaking) this.pumpQueue();
|
|
5796
|
+
}
|
|
5797
|
+
/** Play the next queued worker utterance as its own one-shot turn (begin → delta → end). Drives the
|
|
5798
|
+
* next one from the settle completion (endSpeech), so utterances serialize without overlap. */
|
|
5799
|
+
pumpQueue() {
|
|
5800
|
+
if (this.speaking) return;
|
|
5801
|
+
const text = this.uttQueue.shift();
|
|
5802
|
+
if (text == null) return;
|
|
5803
|
+
this.beginSpeech();
|
|
5804
|
+
this.speakDelta(text);
|
|
5805
|
+
this.endSpeech();
|
|
5806
|
+
}
|
|
5694
5807
|
/** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
|
|
5695
5808
|
interrupt() {
|
|
5809
|
+
this.uttQueue = [];
|
|
5696
5810
|
if (!this.speaking && !this.drainTimer) return;
|
|
5697
5811
|
if (this.drainTimer) {
|
|
5698
5812
|
clearTimeout(this.drainTimer);
|
|
@@ -5714,6 +5828,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5714
5828
|
this.setState("listening");
|
|
5715
5829
|
}
|
|
5716
5830
|
stop() {
|
|
5831
|
+
this.uttQueue = [];
|
|
5717
5832
|
if (this.resumeTimer) clearTimeout(this.resumeTimer);
|
|
5718
5833
|
if (this.pendingTimer) clearTimeout(this.pendingTimer);
|
|
5719
5834
|
if (this.drainTimer) clearTimeout(this.drainTimer);
|
|
@@ -5744,6 +5859,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5744
5859
|
}
|
|
5745
5860
|
handlePartial(text) {
|
|
5746
5861
|
if (this.speaking) {
|
|
5862
|
+
if (!this.options.bargeIn) return;
|
|
5747
5863
|
if (now() < this.bargeGraceUntil) {
|
|
5748
5864
|
if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
|
|
5749
5865
|
return;
|
|
@@ -5819,7 +5935,7 @@ var VoiceEngine = class _VoiceEngine {
|
|
|
5819
5935
|
this.stt.reset();
|
|
5820
5936
|
return;
|
|
5821
5937
|
}
|
|
5822
|
-
if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
|
|
5938
|
+
if (this.echoActive() && (!this.options.bargeIn || (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2))) {
|
|
5823
5939
|
this.stt.reset();
|
|
5824
5940
|
return;
|
|
5825
5941
|
}
|
|
@@ -7273,6 +7389,9 @@ var AecDuplexAudio = class {
|
|
|
7273
7389
|
return this._aec;
|
|
7274
7390
|
}
|
|
7275
7391
|
onFatal;
|
|
7392
|
+
/** Fired once when capture degrades to the non-VPIO (no-AEC) fallback — the host switches to
|
|
7393
|
+
* half-duplex so the assistant's own TTS can't bleed back as a phantom turn. */
|
|
7394
|
+
onDegrade;
|
|
7276
7395
|
proc = null;
|
|
7277
7396
|
stopped = false;
|
|
7278
7397
|
micDenied = false;
|
|
@@ -7290,6 +7409,12 @@ var AecDuplexAudio = class {
|
|
|
7290
7409
|
// --- AudioSource ---
|
|
7291
7410
|
start(onChunk) {
|
|
7292
7411
|
this.onChunk = onChunk;
|
|
7412
|
+
if (process.env.MIC_NO_VPIO === "1") {
|
|
7413
|
+
this.noVpio = true;
|
|
7414
|
+
this.triedFallback = true;
|
|
7415
|
+
this._aec = false;
|
|
7416
|
+
this.onDegrade?.();
|
|
7417
|
+
}
|
|
7293
7418
|
this.spawnHelper();
|
|
7294
7419
|
}
|
|
7295
7420
|
/** (Re)spawn the helper. On the first spawn, arm a fast watchdog: if VPIO delivers NO audio within
|
|
@@ -7326,7 +7451,8 @@ var AecDuplexAudio = class {
|
|
|
7326
7451
|
this.triedFallback = true;
|
|
7327
7452
|
this.noVpio = true;
|
|
7328
7453
|
this._aec = false;
|
|
7329
|
-
log16.warn("mic-aec: VPIO delivered no audio in 2.5s \u2014 falling back to non-VPIO capture (
|
|
7454
|
+
log16.warn("mic-aec: VPIO delivered no audio in 2.5s \u2014 falling back to non-VPIO capture (no AEC \u2192 half-duplex, no barge-in)");
|
|
7455
|
+
this.onDegrade?.();
|
|
7330
7456
|
this.killProc();
|
|
7331
7457
|
this.spawnHelper();
|
|
7332
7458
|
}, 2500);
|
|
@@ -7451,6 +7577,7 @@ var VoiceIO = class extends VoiceEngine {
|
|
|
7451
7577
|
// textless residue pre-pause: opt-in (hiccup source)
|
|
7452
7578
|
});
|
|
7453
7579
|
this.duplexSource = duplex;
|
|
7580
|
+
if (duplex) duplex.onDegrade = () => this.setBargeIn(false);
|
|
7454
7581
|
}
|
|
7455
7582
|
/** Host hook for an unrecoverable audio failure — mic permission denied (duplex source) or no mic
|
|
7456
7583
|
* audio at all (STT watchdog). Routed to whichever can detect it. */
|
|
@@ -11489,6 +11616,23 @@ async function repl(args, ai, cfg, cwd) {
|
|
|
11489
11616
|
...base,
|
|
11490
11617
|
notify(e) {
|
|
11491
11618
|
if (voiceIO && (e.kind === "thinking_delta" || e.kind === "turn_start")) return;
|
|
11619
|
+
if (e.kind === "speak_utterance") {
|
|
11620
|
+
if (voiceIO) {
|
|
11621
|
+
spinner.stop();
|
|
11622
|
+
voiceIO.enqueueUtterance(e.message);
|
|
11623
|
+
editorRef?.suspend();
|
|
11624
|
+
voiceEcho(e.message);
|
|
11625
|
+
voiceEchoEnd();
|
|
11626
|
+
editorRef?.resume();
|
|
11627
|
+
editorRef?.redrawNow();
|
|
11628
|
+
} else {
|
|
11629
|
+
err("\r\x1B[0J" + dim(` \u29BF ${plainLine(e.message)}
|
|
11630
|
+
`));
|
|
11631
|
+
editorRef?.redrawNow();
|
|
11632
|
+
repaintStash();
|
|
11633
|
+
}
|
|
11634
|
+
return;
|
|
11635
|
+
}
|
|
11492
11636
|
if (e.kind === "text_delta" && voiceIO) {
|
|
11493
11637
|
spinner.stop();
|
|
11494
11638
|
voiceIO.speakDelta(e.message);
|
|
@@ -13193,8 +13337,9 @@ ${out}
|
|
|
13193
13337
|
return;
|
|
13194
13338
|
}
|
|
13195
13339
|
const cut = voiceIO.takeInterruptedReply();
|
|
13196
|
-
const note = cut
|
|
13197
|
-
[the user interrupted you mid-speech \u2014 they only heard up to: "\u2026${cut.heard.slice(-80)}". Work any unheard essentials into your reply naturally, only if still relevant.]` :
|
|
13340
|
+
const note = !cut || cut.full.length - cut.heard.length <= 40 ? "" : cut.heard.trim() ? `
|
|
13341
|
+
[the user interrupted you mid-speech \u2014 they only heard up to: "\u2026${cut.heard.slice(-80)}". Work any unheard essentials into your reply naturally, only if still relevant.]` : `
|
|
13342
|
+
[the user interrupted you before hearing any of your previous reply \u2014 none of it landed; do not assume they got it.]`;
|
|
13198
13343
|
if (!/^[!#/]/.test(text.trim())) voiceIO.beginSpeech(true);
|
|
13199
13344
|
err(`\r\x1B[K ${bold(cyan("\u{1F3A4} \u203A"))} ${text}
|
|
13200
13345
|
`);
|