@livx.cc/agentx 0.96.17 → 0.97.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -1,11 +1,11 @@
1
- import { a as AgentOptions, H as Hooks, h as RunResult, A as Agent } from './Agent-DdhD1pGw.js';
2
- export { C as ChatFragment, D as DEFAULT_MUTATING, b as Decision, P as PermissionOptions, c as PermissionPolicy, d as PermissionRule, e as PreToolUseDecision, R as ReasoningEffort, f as RecordingHooks, g as RecordingLifecycle, T as ToolUse, i as ToolUseMeta, j as composeHooks, p as planMode, r as reasoningToChatFragment } from './Agent-DdhD1pGw.js';
1
+ import { a as AgentOptions, H as Hooks, h as RunResult, A as Agent } from './Agent-B_JD31Zx.js';
2
+ export { C as ChatFragment, D as DEFAULT_MUTATING, b as Decision, P as PermissionOptions, c as PermissionPolicy, d as PermissionRule, e as PreToolUseDecision, R as ReasoningEffort, f as RecordingHooks, g as RecordingLifecycle, T as ToolUse, i as ToolUseMeta, j as composeHooks, p as planMode, r as reasoningToChatFragment } from './Agent-B_JD31Zx.js';
3
3
  import { IFilesystem, FileMetadata } from '@livx.cc/wcli/core';
4
4
  export { CommandExecutor, FileMetadata, IFilesystem, IndexedDbFilesystem, MemFilesystem, registerHeadlessCommands } from '@livx.cc/wcli/core';
5
5
  import { BodDB } from '@bod.ee/db';
6
- import { A as AgentTool, C as ChatLike, a as ChatOptions, b as ChatResponse, h as ToolCall, H as HostBridge, U as UserQuestion, e as MessageContent } from './tools-DtpN8Agv.js';
7
- export { c as ContentPart, d as HostEvent, M as Message, R as Role, S as SandboxJobRegistry, f as StreamChunk, T as TodoItem, g as Tool, i as ToolContext, j as bashTool, k as contentText, l as defaultTools, m as editTool, n as exitSessionTool, o as imagePart, p as makeContext, q as makeJobTools, r as readTool, t as toWireTools, s as todoWriteTool, u as toolRegistry, v as toolsByName } from './tools-DtpN8Agv.js';
8
- export { M as McpCall, a as McpImage, b as McpRoute, c as McpRouteResolver, d as McpToolResult, e as McpToolSearchOptions, f as McpToolSpec, g as MountedMcpLike, h as buildMcpCatalog, m as makeLazyMcpToolSearch, i as makeMcpToolSearch, j as makeMcpToolSearchFromMounted, k as mcpToolToAgentTool, l as mcpToolsToAgentTools } from './mcp-CnzmQ8JE.js';
6
+ import { A as AgentTool, C as ChatLike, a as ChatOptions, b as ChatResponse, h as ToolCall, H as HostBridge, U as UserQuestion, e as MessageContent } from './tools-DmrqMJcI.js';
7
+ export { c as ContentPart, d as HostEvent, M as Message, R as Role, S as SandboxJobRegistry, f as StreamChunk, T as TodoItem, g as Tool, i as ToolContext, j as bashTool, k as contentText, l as defaultTools, m as editTool, n as exitSessionTool, o as imagePart, p as makeContext, q as makeJobTools, r as readTool, t as toWireTools, s as todoWriteTool, u as toolRegistry, v as toolsByName } from './tools-DmrqMJcI.js';
8
+ export { M as McpCall, a as McpImage, b as McpRoute, c as McpRouteResolver, d as McpToolResult, e as McpToolSearchOptions, f as McpToolSpec, g as MountedMcpLike, h as buildMcpCatalog, m as makeLazyMcpToolSearch, i as makeMcpToolSearch, j as makeMcpToolSearchFromMounted, k as mcpToolToAgentTool, l as mcpToolsToAgentTools } from './mcp-BZcizHav.js';
9
9
  import * as libx_js_src_modules_log from 'libx.js/src/modules/log';
10
10
  export { log } from 'libx.js/src/modules/log';
11
11
 
@@ -803,6 +803,24 @@ declare function reflectOnRun(o: ReflectOptions): Promise<string | null>;
803
803
  */
804
804
  declare function loadInstructions(fs: IFilesystem, names?: string[]): Promise<string>;
805
805
 
806
+ declare class SpokenSplitter {
807
+ private buf;
808
+ private inSpoken;
809
+ /** True once any spoken char has ever been emitted (drives the no-spoken fallback). */
810
+ spokeAny: boolean;
811
+ /** Feed a delta; returns the spoken/detail spans completed by this chunk (either may be ''). */
812
+ feed(delta: string): {
813
+ spoken: string;
814
+ detail: string;
815
+ };
816
+ /** Drain any buffered partial. A trailing `<…` that never completed a tag is emitted as detail. */
817
+ flush(): {
818
+ spoken: string;
819
+ detail: string;
820
+ };
821
+ private drain;
822
+ }
823
+
806
824
  /**
807
825
  * DuplexAgent — voice-optimized three-tier conversational engine, composed on top of `Agent`.
808
826
  *
@@ -812,8 +830,9 @@ declare function loadInstructions(fs: IFilesystem, names?: string[]): Promise<st
812
830
  * ACT — standard worker (Sonnet-class). Full tools, file access, shell. The hands.
813
831
  * THINK — premium reasoning (Opus-class). Deep analysis, architecture, hard problems. The brain.
814
832
  *
815
- * Workers are spawned per escalation via `Act`/`Think` tools. Results are pushed back as
816
- * `[task <id> completed] …` events and re-voiced by the reflex push, not poll.
833
+ * Workers are spawned per escalation via `Act`/`Think` tools. A clean success streams its own
834
+ * `<spoken>` delivery during the run; an incomplete/failed result is pushed back as a
835
+ * `[task <id> completed|failed] …` event and re-voiced by the reflex as a decision — push, not poll.
817
836
  *
818
837
  * Host events (via the open HostEvent union): the voice agent's standard `text_delta` stream,
819
838
  * plus `task_started` / `task_progress` / `task_done` / `task_error` / `task_cancelled`.
@@ -836,6 +855,9 @@ interface TaskRecord {
836
855
  /** True when this task is ITSELF an automatic follow-up (escalation/re-delegate) of a prior task.
837
856
  * Its integration turn may NOT escalate again — caps auto-follow-up to one hop per original task. */
838
857
  followUp: boolean;
858
+ /** Per-worker `<spoken>` splitter — the worker OWNS delivery: spoken segments stream during its run.
859
+ * Read at settle (spokeAny) to decide the no-spoken fallback. */
860
+ splitter?: SpokenSplitter;
839
861
  }
840
862
  type WorkerTier = 'act' | 'think';
841
863
  declare class DuplexAgentOptions {
@@ -1010,12 +1032,14 @@ declare class DuplexAgent {
1010
1032
  parkQuestion(askId: string, question: string): Promise<string>;
1011
1033
  /** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
1012
1034
  private dropAsk;
1013
- /** Build the INTEGRATION TURN prompt for a settled worker. Instead of trust-and-forwarding the raw
1014
- * result, the result re-enters the reflex as a decision (like a tool_result flowing back into a normal
1015
- * agent loop): the reflex evaluates the outcome against the original intent and chooses what to do next.
1035
+ /** Build the INTEGRATION TURN prompt for a NON-CLEAN settled worker (early stop / failure). A clean
1036
+ * success never reaches here — it streams its own `<spoken>` delivery during the run. For a partial
1037
+ * or failed result the outcome re-enters the reflex as a decision (like a tool_result flowing back
1038
+ * into a normal agent loop): the reflex evaluates the outcome against the original intent and chooses
1039
+ * what to do next.
1016
1040
  *
1017
1041
  * Decision branches (the reflex acts on them with EXISTING tools — no new surface):
1018
- * • accept → just SPEAK the result to the user (happy path; the only move on a clean success).
1042
+ * • accept → SPEAK the (partial) result plainly — don't dress a failure up as success.
1019
1043
  * • escalate → call `Think` with the SAME brief — only when Act failed/stalled AND a Think tier
1020
1044
  * exists AND this task wasn't already a follow-up (one hop max). Wires the dead
1021
1045
  * "Reserve Think for a problem Act already FAILED at" promise.
@@ -1209,6 +1233,11 @@ declare class VoiceEngineOptions {
1209
1233
  * as a barge and abort the fresh turn (live: mid-sentence self-interruption + steps=1→steps=0 double
1210
1234
  * abort). Short enough that a genuine immediate barge ("no wait—") still lands right after. */
1211
1235
  bargeGraceMs: number;
1236
+ /** Barge-in (talk over the assistant to interrupt). true = full-duplex (needs echo cancellation, or
1237
+ * the assistant's own TTS bleeds back and self-interrupts). false = HALF-DUPLEX: the engine is deaf
1238
+ * while audible (speaking + drain tail), so echo can never become a phantom turn — the right mode
1239
+ * when there's no AEC (e.g. the non-VPIO mic fallback) and no headphones. Cost: can't interrupt. */
1240
+ bargeIn: boolean;
1212
1241
  /** Filler phrase spoken when holding for an incomplete utterance ('' disables). */
1213
1242
  holdFiller: string;
1214
1243
  /** Called when the engine holds an incomplete utterance (host can render a visual cue). */
@@ -1266,9 +1295,12 @@ declare class VoiceEngine {
1266
1295
  private lastOverlapPartial;
1267
1296
  private resumeTimer;
1268
1297
  private turnStartAt;
1298
+ private uttQueue;
1269
1299
  constructor(options?: Partial<VoiceEngineOptions>);
1270
1300
  start(): Promise<void>;
1271
1301
  get usingAec(): boolean;
1302
+ /** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
1303
+ setBargeIn(on: boolean): void;
1272
1304
  private idleWaiters;
1273
1305
  private setState;
1274
1306
  /** Resolve when the engine is no longer speaking (immediate if already idle). */
@@ -1288,6 +1320,13 @@ declare class VoiceEngine {
1288
1320
  } | null;
1289
1321
  /** Speak a short filler phrase without starting a model turn (stays in listening mode after). */
1290
1322
  speakFiller(text: string): void;
1323
+ /** Enqueue a COMPLETE worker utterance (already-split spoken text) onto the central speech queue.
1324
+ * If nothing is currently speaking it plays immediately; otherwise it queues and plays after the
1325
+ * current utterance fully ends (settle → pumpQueue) — never spliced into an open reflex utterance. */
1326
+ enqueueUtterance(text: string): void;
1327
+ /** Play the next queued worker utterance as its own one-shot turn (begin → delta → end). Drives the
1328
+ * next one from the settle completion (endSpeech), so utterances serialize without overlap. */
1329
+ private pumpQueue;
1291
1330
  /** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
1292
1331
  interrupt(): void;
1293
1332
  stop(): void;
package/dist/index.js CHANGED
@@ -4647,6 +4647,57 @@ function digestRun(messages, maxChars) {
4647
4647
  // src/duplex.ts
4648
4648
  import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
4649
4649
  init_logging();
4650
+
4651
+ // src/voice/spokenSplitter.ts
4652
+ var OPEN = "<spoken>";
4653
+ var CLOSE = "</spoken>";
4654
+ var SpokenSplitter = class {
4655
+ buf = "";
4656
+ inSpoken = false;
4657
+ /** True once any spoken char has ever been emitted (drives the no-spoken fallback). */
4658
+ spokeAny = false;
4659
+ /** Feed a delta; returns the spoken/detail spans completed by this chunk (either may be ''). */
4660
+ feed(delta) {
4661
+ this.buf += delta;
4662
+ return this.drain(false);
4663
+ }
4664
+ /** Drain any buffered partial. A trailing `<…` that never completed a tag is emitted as detail. */
4665
+ flush() {
4666
+ return this.drain(true);
4667
+ }
4668
+ drain(final) {
4669
+ let spoken = "";
4670
+ let detail = "";
4671
+ while (this.buf.length) {
4672
+ const tag = this.inSpoken ? CLOSE : OPEN;
4673
+ const idx = this.buf.indexOf(tag);
4674
+ if (idx >= 0) {
4675
+ const text2 = this.buf.slice(0, idx);
4676
+ if (this.inSpoken) spoken += text2;
4677
+ else detail += text2;
4678
+ this.buf = this.buf.slice(idx + tag.length);
4679
+ this.inSpoken = !this.inSpoken;
4680
+ continue;
4681
+ }
4682
+ const lt = this.buf.lastIndexOf("<");
4683
+ const holdStart = lt >= 0 && tag.startsWith(this.buf.slice(lt)) ? lt : this.buf.length;
4684
+ const text = this.buf.slice(0, holdStart);
4685
+ if (this.inSpoken) spoken += text;
4686
+ else detail += text;
4687
+ this.buf = this.buf.slice(holdStart);
4688
+ break;
4689
+ }
4690
+ if (final && this.buf) {
4691
+ if (this.inSpoken) spoken += this.buf;
4692
+ else detail += this.buf;
4693
+ this.buf = "";
4694
+ }
4695
+ if (spoken.trim()) this.spokeAny = true;
4696
+ return { spoken, detail };
4697
+ }
4698
+ };
4699
+
4700
+ // src/duplex.ts
4650
4701
  var log9 = forComponent("DuplexAgent");
4651
4702
  function describeCall(call) {
4652
4703
  const v = call.args && Object.values(call.args).find((x) => typeof x === "string" && x.trim());
@@ -4714,7 +4765,7 @@ var DuplexAgentOptions = class {
4714
4765
  };
4715
4766
  var RESERVED_EVENT_MARKER = /\[task\b[^\]\n]*\b(?:completed|failed|progress|asks)\b/i;
4716
4767
  var RESERVED_EVENT_OPENER = /\[\s*task\b/i;
4717
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, speak the USEFUL gist in one or two short sentences \u2014 the actual answer the user wanted (the headline finding, the key numbers), not the thinnest possible "it\'s done". A forecast \u2192 say it\'s calm AND that it\'s good for swimming but not surf; a count \u2192 say the number. Be brief, but do not drop the substance. 
DISTILL vs DELIVER \u2014 know which the request wants. When the result is a FACT to extract (a forecast, a count, a status), distill the headline. But when the user wanted specific CONTENT \u2014 a joke, a quote, a name, a definition, the actual lines \u2014 that content IS the deliverable: LEAD WITH IT. Your first words ARE the joke / the quote / the answer itself, before any "got it" or offer. SPEAK the content, never a comment ABOUT it: "why was six afraid of seven? because seven ate nine" \u2014 NOT "those are funny" or "I found a couple". If you did not actually say the joke/quote/answer aloud this turn, you FAILED the request, no matter how friendly the wrapper. A short joke is short \u2014 just say it. NEVER speak as if you already delivered something you did not actually say aloud THIS turn: do not say "those are\u2026", "there you go", or offer "a few MORE" when you never voiced the first one. The on-screen text is invisible to a voice user \u2014 if you did not speak it, they did not get it, so deliver it before you comment on it or offer more. If the result is a LIST (search results, multiple files/matches), the user CANNOT see it \u2014 there is no screen and no numbered menu to point at. Speak the gist: say what you found and name the top one or two by NAME (the source, not "the first one" or a number), then ask plainly if they want more. Never ask them to "pick which one" or reference items by position. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. 
"[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo NOT end every turn with the same canned offer ("want a rundown?", "want the steps?"). Offer once at most; if the user pushes back, repeats themselves, or sounds unsatisfied ("you know what I mean?", "think deeper", "are you sure?"), do NOT re-offer the same thing \u2014 change approach: dispatch `Act`/`Think` to actually dig in, or ask one concrete clarifying question. Repeating a non-answer is worse than silence.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). 
NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
4768
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nA completed task speaks its OWN result to the user (the worker voices what matters as it finishes) \u2014 you do NOT re-voice clean task results. A FAILED or INCOMPLETE task still arrives as a "[task t1 failed] \u2026" event for you to handle. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. 
Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo NOT end every turn with the same canned offer ("want a rundown?", "want the steps?"). Offer once at most; if the user pushes back, repeats themselves, or sounds unsatisfied ("you know what I mean?", "think deeper", "are you sure?"), do NOT re-offer the same thing \u2014 change approach: dispatch `Act`/`Think` to actually dig in, or ask one concrete clarifying question. Repeating a non-answer is worse than silence.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. 
Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. 
Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled, surprising, or only half-parses, do NOT guess an action or improvise content from it \u2014 briefly confirm what they meant ("did you mean\u2026?") and wait. A one-line confirm beats a confident wrong answer or an invented response to a request you did not actually understand.';
4718
4769
  var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
4719
4770
  var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
4720
4771
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
@@ -4989,13 +5040,14 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
4989
5040
  * Act briefs get a self-verify footer — the worker's report is trusted without review, so it
4990
5041
  * must check its own work before reporting (nearly free under prompt caching; measured honest:
4991
5042
  * it does NOT fix one-shot logic bugs — see mind/10). Think tasks are pure reasoning — no footer. */
4992
- buildBrief(brief, tier = "act") {
5043
+ buildBrief(brief, tier = "act", deliver = true) {
4993
5044
  const recent = this.voice.transcript.filter((m) => (m.role === "user" || m.role === "assistant") && contentText(m.content).trim()).slice(-this.options.excerptTurns).map((m) => `${m.role}: ${contentText(m.content)}`).join("\n");
4994
5045
  const verify = tier === "act" ? "\n\nBefore reporting done: re-read what you changed and check it against EVERY requirement above \u2014 fix any gap first. Your report is trusted without review." : "";
5046
+ const deliverContract = deliver ? "\n\n## DELIVER (spoken delivery)\nYou are reporting back to a user who is LISTENING. Stream your work normally \u2014 your prose is the written work record and detail, and is NOT spoken. Wrap anything the user should HEAR in <spoken>\u2026</spoken> tags. LEAD WITH the actual content they asked for: if they asked for a specific piece of content \u2014 a value, a name, the actual lines, the writing itself \u2014 that content goes INSIDE the <spoken> tags, not a remark about it. Your FIRST <spoken> segment is substantive \u2014 never a greeting or an acknowledgement (the front-end has already acked; do not double-ack). Keep spoken text concise and natural for the ear: short sentences, no markdown." : "";
4995
5047
  return (recent ? `${brief}
4996
5048
 
4997
5049
  ## Recent conversation (for context)
4998
- ${recent}` : brief) + verify;
5050
+ ${recent}` : brief) + verify + deliverContract;
4999
5051
  }
5000
5052
  /** Spawn a detached worker for task `id`; its settlement notifies + enqueues the re-voice turn. */
5001
5053
  spawnWorker(id, label, briefText, tier, brief, followUp) {
@@ -5034,7 +5086,22 @@ ${recent}` : brief) + verify;
5034
5086
  const a = await this.parkQuestion(id, `${q.question}${opts}`);
5035
5087
  return a || "(no answer from the user \u2014 use your best judgment and note the assumption)";
5036
5088
  };
5037
- const workerHost = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
5089
+ const splitter = new SpokenSplitter();
5090
+ const speak = (seg) => {
5091
+ if (seg) o.host?.notify?.({ kind: "speak_utterance", message: seg });
5092
+ };
5093
+ const askBridge = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q) => o.host.ask(q) } : {};
5094
+ const workerHost = {
5095
+ ...askBridge,
5096
+ notify: (ev) => {
5097
+ if (ev?.kind === "text_delta" && typeof ev.message === "string") {
5098
+ const { spoken, detail } = splitter.feed(ev.message);
5099
+ speak(spoken);
5100
+ if (detail.trim()) pushTail(detail.trim());
5101
+ return;
5102
+ }
5103
+ }
5104
+ };
5038
5105
  const agentOpts = {
5039
5106
  ai: o.ai,
5040
5107
  fs: o.fs,
@@ -5044,13 +5111,16 @@ ${recent}` : brief) + verify;
5044
5111
  // Recompute providerOptions for THIS worker's model (after tierOpts so it wins over any inherited
5045
5112
  // main-template value) — prevents cursor-only cwd/cursorSession leaking onto an anthropic worker.
5046
5113
  providerOptions: o.providerOptionsFor?.(tierModel),
5047
- ...workerHost ? { host: workerHost } : {},
5114
+ stream: true,
5115
+ // worker streams text_delta so the splitter can extract <spoken> live (after tierOpts: never overridden off)
5116
+ host: workerHost,
5117
+ // carries BOTH ask AND the <spoken>-splitting notify
5048
5118
  ...hooks ? { hooks } : {},
5049
5119
  signal: controller.signal
5050
5120
  // shared with the checker so a cancel tears down both
5051
5121
  };
5052
- const promise = new Agent(agentOpts).run(briefText).then((res) => this.maybeVerify(id, briefText, res, tier, agentOpts)).then((res) => this.onWorkerSettled(id, res)).catch((err) => this.onWorkerFailed(id, err));
5053
- this.tasks.set(id, { id, label, status: "running", controller, promise, tail, brief, followUp });
5122
+ const promise = new Agent(agentOpts).run(briefText).then((res) => this.maybeVerify(id, brief, res, tier, agentOpts, askBridge)).then((res) => this.onWorkerSettled(id, res)).catch((err) => this.onWorkerFailed(id, err));
5123
+ this.tasks.set(id, { id, label, status: "running", controller, promise, tail, brief, followUp, splitter });
5054
5124
  if (this.tasks.size > this.options.maxTaskRecords)
5055
5125
  for (const [tid, rec] of this.tasks) {
5056
5126
  if (this.tasks.size <= this.options.maxTaskRecords) break;
@@ -5062,15 +5132,20 @@ ${recent}` : brief) + verify;
5062
5132
  * on the shared fs automatically (workers write fs directly, no overlay), so grading sees the
5063
5133
  * corrected state. Bounded to ONE pass. Off unless `verifyActTasks`; never runs for think/failed/
5064
5134
  * cancelled tasks. Usage is merged so /cost reflects the real (worker + checker) spend. */
5065
- async maybeVerify(id, briefText, res, tier, agentOpts) {
5135
+ async maybeVerify(id, brief, res, tier, agentOpts, askBridge) {
5066
5136
  if (!this.options.verifyActTasks || tier !== "act" || res.finishReason !== "stop") return res;
5067
5137
  if (this.tasks.get(id)?.status === "cancelled") return res;
5068
- const checkBrief = `${briefText}
5138
+ const { stream: _stream, host: _host, ...restOpts } = agentOpts;
5139
+ const checkerOpts = {
5140
+ ...restOpts,
5141
+ ...askBridge.ask ? { host: { ask: askBridge.ask } } : {}
5142
+ };
5143
+ const checkBrief = `${this.buildBrief(brief, tier, false)}
5069
5144
 
5070
5145
  ## VERIFY MODE
5071
5146
  Another agent just implemented the above. Independently check the CURRENT state of the files against EVERY requirement. Fix any gap you find. If everything is already correct, make NO changes \u2014 do not refactor or improve \u2014 and report "verified".`;
5072
5147
  this.notify("task_verify", `task ${id}: verifying`, { id });
5073
- const cres = await new Agent(agentOpts).run(checkBrief);
5148
+ const cres = await new Agent(checkerOpts).run(checkBrief);
5074
5149
  if (cres.finishReason !== "stop") {
5075
5150
  log9.warn(`task ${id}: verify inconclusive (${cres.finishReason})`);
5076
5151
  this.notify("task_verify", `task ${id}: verify inconclusive (${cres.finishReason})`, { id, finishReason: cres.finishReason });
@@ -5161,12 +5236,14 @@ Another agent just implemented the above. Independently check the CURRENT state
5161
5236
  dropAsk(id) {
5162
5237
  this.pendingAsks.get(id)?.resolve("");
5163
5238
  }
5164
- /** Build the INTEGRATION TURN prompt for a settled worker. Instead of trust-and-forwarding the raw
5165
- * result, the result re-enters the reflex as a decision (like a tool_result flowing back into a normal
5166
- * agent loop): the reflex evaluates the outcome against the original intent and chooses what to do next.
5239
+ /** Build the INTEGRATION TURN prompt for a NON-CLEAN settled worker (early stop / failure). A clean
5240
+ * success never reaches here — it streams its own `<spoken>` delivery during the run. For a partial
5241
+ * or failed result the outcome re-enters the reflex as a decision (like a tool_result flowing back
5242
+ * into a normal agent loop): the reflex evaluates the outcome against the original intent and chooses
5243
+ * what to do next.
5167
5244
  *
5168
5245
  * Decision branches (the reflex acts on them with EXISTING tools — no new surface):
5169
- * • accept → just SPEAK the result to the user (happy path; the only move on a clean success).
5246
+ * • accept → SPEAK the (partial) result plainly — don't dress a failure up as success.
5170
5247
  * • escalate → call `Think` with the SAME brief — only when Act failed/stalled AND a Think tier
5171
5248
  * exists AND this task wasn't already a follow-up (one hop max). Wires the dead
5172
5249
  * "Reserve Think for a problem Act already FAILED at" promise.
@@ -5177,8 +5254,6 @@ Another agent just implemented the above. Independently check the CURRENT state
5177
5254
  * failed-revoice fallback still fire, and the per-event transcript markers stay intact. */
5178
5255
  integrationPrompt(rec, outcome, body, finishReason) {
5179
5256
  const opener = outcome === "error" ? `[task ${rec.id} failed]` : `[task ${rec.id} completed]`;
5180
- if (outcome === "ok")
5181
- return `${opener} ${body}`;
5182
5257
  const underCap = this.autoEscalations < _DuplexAgent.MAX_AUTO_ESCALATIONS;
5183
5258
  const canEscalate = (outcome === "error" || outcome === "incomplete") && underCap;
5184
5259
  const hasThink = this.options.thinkModel !== false;
@@ -5218,7 +5293,14 @@ Another agent just implemented the above. Independently check the CURRENT state
5218
5293
  steps: res.steps,
5219
5294
  toolCalls: res.messages.filter((m) => m.role === "tool").length
5220
5295
  });
5221
- this.queueRevoice(this.integrationPrompt(rec, incomplete ? "incomplete" : "ok", res.text, res.finishReason), incomplete);
5296
+ if (incomplete) {
5297
+ return this.queueRevoice(this.integrationPrompt(rec, "incomplete", res.text, res.finishReason), true);
5298
+ }
5299
+ const tail = rec.splitter?.flush();
5300
+ if (tail?.spoken) this.options.host?.notify?.({ kind: "speak_utterance", message: tail.spoken });
5301
+ if (res.text.trim()) this.voice.transcript.push({ role: "assistant", content: res.text });
5302
+ if (!rec.splitter?.spokeAny && res.text.trim())
5303
+ this.options.host?.notify?.({ kind: "speak_utterance", message: res.text });
5222
5304
  }
5223
5305
  onWorkerFailed(id, err) {
5224
5306
  this.failTask(this.tasks.get(id), err instanceof Error ? err.message : String(err));
@@ -5624,6 +5706,11 @@ var VoiceEngineOptions = class {
5624
5706
  * as a barge and abort the fresh turn (live: mid-sentence self-interruption + steps=1→steps=0 double
5625
5707
  * abort). Short enough that a genuine immediate barge ("no wait—") still lands right after. */
5626
5708
  bargeGraceMs = 600;
5709
+ /** Barge-in (talk over the assistant to interrupt). true = full-duplex (needs echo cancellation, or
5710
+ * the assistant's own TTS bleeds back and self-interrupts). false = HALF-DUPLEX: the engine is deaf
5711
+ * while audible (speaking + drain tail), so echo can never become a phantom turn — the right mode
5712
+ * when there's no AEC (e.g. the non-VPIO mic fallback) and no headphones. Cost: can't interrupt. */
5713
+ bargeIn = true;
5627
5714
  /** Filler phrase spoken when holding for an incomplete utterance ('' disables). */
5628
5715
  holdFiller = "";
5629
5716
  /** Called when the engine holds an incomplete utterance (host can render a visual cue). */
@@ -5695,6 +5782,9 @@ var VoiceEngine = class _VoiceEngine {
5695
5782
  resumeTimer = null;
5696
5783
  turnStartAt = 0;
5697
5784
  // timestamp when the current turn began (for TTFT logging)
5785
+ // Central speech queue (above the TTS context): complete worker utterances serialize into ONE
5786
+ // playback stream, one-at-a-time, never splicing into the live reflex's open utterance.
5787
+ uttQueue = [];
5698
5788
  constructor(options) {
5699
5789
  this.options = { ...new VoiceEngineOptions(), ...options };
5700
5790
  const o = this.options;
@@ -5717,6 +5807,10 @@ var VoiceEngine = class _VoiceEngine {
5717
5807
  get usingAec() {
5718
5808
  return this.stt.usingAec;
5719
5809
  }
5810
+ /** Flip barge-in at runtime (e.g. the mic fell back to non-VPIO → go half-duplex so echo can't leak). */
5811
+ setBargeIn(on) {
5812
+ this.options.bargeIn = on;
5813
+ }
5720
5814
  idleWaiters = [];
5721
5815
  setState(s) {
5722
5816
  if (this.state === s) return;
@@ -5789,6 +5883,7 @@ var VoiceEngine = class _VoiceEngine {
5789
5883
  this.echoUntil = now() + 2500;
5790
5884
  if (!this.usingAec) this.stt.reset();
5791
5885
  this.setState("listening");
5886
+ if (this.uttQueue.length) this.pumpQueue();
5792
5887
  };
5793
5888
  const drainThenSettle = () => {
5794
5889
  if (this.drainTimer) clearTimeout(this.drainTimer);
@@ -5815,8 +5910,27 @@ var VoiceEngine = class _VoiceEngine {
5815
5910
  this.speakDelta(text);
5816
5911
  this.endSpeech();
5817
5912
  }
5913
+ /** Enqueue a COMPLETE worker utterance (already-split spoken text) onto the central speech queue.
5914
+ * If nothing is currently speaking it plays immediately; otherwise it queues and plays after the
5915
+ * current utterance fully ends (settle → pumpQueue) — never spliced into an open reflex utterance. */
5916
+ enqueueUtterance(text) {
5917
+ if (!text || !text.trim()) return;
5918
+ this.uttQueue.push(text);
5919
+ if (!this.speaking) this.pumpQueue();
5920
+ }
5921
+ /** Play the next queued worker utterance as its own one-shot turn (begin → delta → end). Drives the
5922
+ * next one from the settle completion (endSpeech), so utterances serialize without overlap. */
5923
+ pumpQueue() {
5924
+ if (this.speaking) return;
5925
+ const text = this.uttQueue.shift();
5926
+ if (text == null) return;
5927
+ this.beginSpeech();
5928
+ this.speakDelta(text);
5929
+ this.endSpeech();
5930
+ }
5818
5931
  /** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
5819
5932
  interrupt() {
5933
+ this.uttQueue = [];
5820
5934
  if (!this.speaking && !this.drainTimer) return;
5821
5935
  if (this.drainTimer) {
5822
5936
  clearTimeout(this.drainTimer);
@@ -5838,6 +5952,7 @@ var VoiceEngine = class _VoiceEngine {
5838
5952
  this.setState("listening");
5839
5953
  }
5840
5954
  stop() {
5955
+ this.uttQueue = [];
5841
5956
  if (this.resumeTimer) clearTimeout(this.resumeTimer);
5842
5957
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
5843
5958
  if (this.drainTimer) clearTimeout(this.drainTimer);
@@ -5868,6 +5983,7 @@ var VoiceEngine = class _VoiceEngine {
5868
5983
  }
5869
5984
  handlePartial(text) {
5870
5985
  if (this.speaking) {
5986
+ if (!this.options.bargeIn) return;
5871
5987
  if (now() < this.bargeGraceUntil) {
5872
5988
  if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
5873
5989
  return;
@@ -5943,7 +6059,7 @@ var VoiceEngine = class _VoiceEngine {
5943
6059
  this.stt.reset();
5944
6060
  return;
5945
6061
  }
5946
- if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
6062
+ if (this.echoActive() && (!this.options.bargeIn || (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2))) {
5947
6063
  this.stt.reset();
5948
6064
  return;
5949
6065
  }