@livx.cc/agentx 0.95.3 → 0.95.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -1907,7 +1907,7 @@ var init_tools_shell = __esm({
1907
1907
 
1908
1908
  // cli/cli.ts
1909
1909
  import { createInterface } from "readline/promises";
1910
- import { existsSync as existsSync10, readFileSync as readFileSync7, appendFileSync, mkdirSync as mkdirSync11, writeFileSync as writeFileSync9, readdirSync as readdirSync4, statSync as statSync4, unlinkSync as unlinkSync5 } from "fs";
1910
+ import { existsSync as existsSync10, readFileSync as readFileSync8, appendFileSync, mkdirSync as mkdirSync11, writeFileSync as writeFileSync9, readdirSync as readdirSync4, statSync as statSync4, unlinkSync as unlinkSync5 } from "fs";
1911
1911
  import { homedir as homedir9, tmpdir as tmpdir3 } from "os";
1912
1912
 
1913
1913
  // cli/clipboard.ts
@@ -4623,7 +4623,7 @@ var DuplexAgentOptions = class {
4623
4623
  memoryUserDir;
4624
4624
  };
4625
4625
  var RESERVED_EVENT_MARKER = /\[task\b[^\]\n]*\b(?:completed|failed|progress|asks)\b/i;
4626
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, speak the USEFUL gist in one or two short sentences \u2014 the actual answer the user wanted (the headline finding, the key numbers), not the thinnest possible "it\'s done". A forecast \u2192 say it\'s calm AND that it\'s good for swimming but not surf; a count \u2192 say the number. Be brief, but do not drop the substance. 
If the result is a LIST (search results, multiple files/matches), the user CANNOT see it \u2014 there is no screen and no numbered menu to point at. Speak the gist: say what you found and name the top one or two by NAME (the source, not "the first one" or a number), then ask plainly if they want more. Never ask them to "pick which one" or reference items by position. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo NOT end every turn with the same canned offer ("want a rundown?", "want the steps?"). Offer once at most; if the user pushes back, repeats themselves, or sounds unsatisfied ("you know what I mean?", "think deeper", "are you sure?"), do NOT re-offer the same thing \u2014 change approach: dispatch `Act`/`Think` to actually dig in, or ask one concrete clarifying question. 
Repeating a non-answer is worse than silence.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. 
Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
4626
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nThis holds even when asked to "print", "list", "show", or "make a table" \u2014 there is no screen for the spoken channel. Speak it as flowing prose ("Tuesday is half a meter, Wednesday a bit less\u2026"), or if they truly need it on screen, route it to Act to render. Never emit dashes or pipes into speech.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou have three cognitive tiers \u2014 like a human brain:\n\u2022 YOU (reflex) \u2014 instant, lightweight. Handle greetings, simple questions, status checks, QuickLook.\n\u2022 `Act` \u2014 your hands. A background worker with its own configured tools and access to the user\'s environment (files and shell{{WORKER_WEB}}). Use for reading, editing, searching, running tasks, building \u2014 any real work.\n{{THINK_SLOT}}\nWhen you are unsure whether you can do or access something, do NOT assume and do NOT claim a capability you have not confirmed. To check what you can do, QuickLook `capabilities` (instant \u2014 it lists your worker\'s real tools) and answer from that. Never promise an ability that is not in your capabilities; if it is not there, tell the user plainly you can\'t. To actually DO real work, call `Act`. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), call `Act` IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nYou cannot mute the microphone or stop voice capture yourself \u2014 no tool does it. 
If the user asks you to stop listening or turn the voice off, never claim you did: tell them to say exactly "voice off" (handled by the app directly), or type /voice.\nYou are NOT a knowledge base. For any question whose answer needs SPECIFIC verifiable facts you do not already have in hand \u2014 how to build/configure/implement something, exact API, library, entitlement, command or option names, current events, or particular numbers, dates, or names \u2014 do NOT answer from your own memory: you will confidently make things up (a fake API, a wrong entitlement, an event that did not happen). Route it to `Act`, which can search and verify, and speak only what its report says. Answer inline ONLY for general conversation, chit-chat, and trivia you are sure of, or facts you can see via QuickLook. When elaborating on a completed task ("tell me more", "the gist"), stay strictly within what that result actually said \u2014 if the user asks for something the result did not cover, that is NEW information: dispatch `Act`, do not improvise.\nALWAYS react before you work: the FIRST thing in your turn is a brief spoken acknowledgement of what you heard and what you are about to do ("got it \u2014 opening that now", "sure, let me pull it up", "okay, checking"). NEVER call a tool (Act, Think, QuickLook) silently \u2014 the user must hear you react before you go quiet to work. After dispatching Act or Think, that same one short sentence IS your turn \u2014 end it and do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, speak the USEFUL gist in one or two short sentences \u2014 the actual answer the user wanted (the headline finding, the key numbers), not the thinnest possible "it\'s done". A forecast \u2192 say it\'s calm AND that it\'s good for swimming but not surf; a count \u2192 say the number. Be brief, but do not drop the substance. 
DISTILL vs DELIVER \u2014 know which the request wants. When the result is a FACT to extract (a forecast, a count, a status), distill the headline. But when the user wanted specific CONTENT \u2014 a joke, a quote, a name, a definition, the actual lines \u2014 that content IS the deliverable: LEAD WITH IT. Your first words ARE the joke / the quote / the answer itself, before any "got it" or offer. SPEAK the content, never a comment ABOUT it: "why was six afraid of seven? because seven ate nine" \u2014 NOT "those are funny" or "I found a couple". If you did not actually say the joke/quote/answer aloud this turn, you FAILED the request, no matter how friendly the wrapper. A short joke is short \u2014 just say it. NEVER speak as if you already delivered something you did not actually say aloud THIS turn: do not say "those are\u2026", "there you go", or offer "a few MORE" when you never voiced the first one. The on-screen text is invisible to a voice user \u2014 if you did not speak it, they did not get it, so deliver it before you comment on it or offer more. If the result is a LIST (search results, multiple files/matches), the user CANNOT see it \u2014 there is no screen and no numbered menu to point at. Speak the gist: say what you found and name the top one or two by NAME (the source, not "the first one" or a number), then ask plainly if they want more. Never ask them to "pick which one" or reference items by position. The completed result stays in YOUR context \u2014 it is yours to draw on. When the user follows up ("tell me more", "what else", "and?"), answer FROM that result first: you already have the detail, so elaborate on what you have. Do NOT spawn a fresh worker to re-search or re-gather what you were just handed. Re-dispatch ONLY when genuinely new information is needed \u2014 e.g. the user wants the full contents of a SPECIFIC source, which is one WebFetch of that URL, not a brand-new search. 
"[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nCRITICAL: while a task is still running you have NO answer yet \u2014 never state a specific result of any kind (a number, size, count, name, path, or value). The real answer arrives ONLY in the "[task \u2026 completed]" event; inventing one meanwhile (a made-up disk size, commit count, etc.) is a serious error. Until then, only acknowledge and wait.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo NOT end every turn with the same canned offer ("want a rundown?", "want the steps?"). Offer once at most; if the user pushes back, repeats themselves, or sounds unsatisfied ("you know what I mean?", "think deeper", "are you sure?"), do NOT re-offer the same thing \u2014 change approach: dispatch `Act`/`Think` to actually dig in, or ask one concrete clarifying question. Repeating a non-answer is worse than silence.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nIf the user\'s message sounds INCOMPLETE \u2014 trailing off mid-sentence, a fragment that needs more context ("and then we", "but the problem is"), hesitation fillers ("uh", "um") \u2014 call `Hold` instead of answering. This keeps listening for the rest of their thought. Only respond with substance when you have a complete question or request.\nDispatch discipline: send ONE self-contained task per request \u2014 a single worker with the full brief beats several workers with fragments (each worker starts fresh and re-discovers context). 
NEVER dispatch a worker just to read files or gather information \u2014 workers explore and discover context themselves; pass on what you already know and let one worker do the whole job. Split into parallel tasks only when the user asks for genuinely independent things. When a task completes, report its result and stop \u2014 do NOT dispatch follow-up work (verification, polish, extras) the user did not ask for, unless the report itself signals failure or doubt.\nDo not fire a second Act/Think for work already in flight, and NEVER spawn a second task to re-count, cross-check, or verify a result a worker already gave you \u2014 trust its answer; a single question gets ONE task. Call `TaskStatus` at most ONCE per turn; if a task is still running, just say "still on it" and end the turn \u2014 never poll it again and again in a loop. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not act, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file, or checking your own `capabilities`/tools \u2014 use `QuickLook` (instant, no task). Whenever the user asks what you can do or whether you have some ability, QuickLook `capabilities` and answer from that \u2014 never guess. Anything requiring searching, reasoning, running commands, or editing goes through `Act`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
4627
4627
  var THINK_GUIDANCE = "\u2022 `Think` \u2014 your brain. A premium reasoning model, FAR more expensive than Act. Reserve it for open-ended architecture/design questions, or a problem Act already FAILED at. ALL implementation work \u2014 coding, refactoring, debugging, edge cases, tests \u2014 goes to Act; Act is highly capable. Never send the same work to both.";
4628
4628
  var THINK_DISABLED_GUIDANCE = "(Think tier is not available \u2014 use Act for all escalations.)";
4629
4629
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you escalate to Act or Think, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now act", no task ids out loud).`;
@@ -5391,10 +5391,10 @@ var VoiceEngineOptions = class {
5391
5391
  bargeRmsFloor = 500;
5392
5392
  /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model, driven by
5393
5393
  * the STT ITSELF (a trained speech classifier) instead of energy thresholds (energy could not
5394
- * separate residue bursts from speech in every room — hiccup whack-a-mole): partial text while
5395
- * speaking → PAUSE (exact-sample hold); partial grows into dominant-novel ≥2 words → cede
5396
- * (interrupt; the LLM re-enters); partial stalls/endpoints without ceding (backchannel by
5397
- * DURATION, not vocabulary) → resume + drop. false disables. */
5394
+ * separate residue bursts from speech in every room — hiccup whack-a-mole): a GENUINE partial
5395
+ * (novel words dominate — echo of our own reply is inert) while speaking → PAUSE (exact-sample
5396
+ * hold); partial grows into dominant-novel ≥2 words → cede (interrupt; the LLM re-enters); partial
5397
+ * stalls/endpoints without ceding (backchannel by DURATION, not vocabulary) → resume + drop. false disables. */
5398
5398
  overlapPause = true;
5399
5399
  /** no new partial activity for this long while paused → resume, drop the interjection */
5400
5400
  overlapResumeMs = 700;
@@ -5404,6 +5404,12 @@ var VoiceEngineOptions = class {
5404
5404
  * Mechanism-based discriminator: a re-PAUSE this soon after a resume = a persistent human, not an
5405
5405
  * echo blip (which pauses once and stalls). Cede on the re-pause regardless of the novel gate. */
5406
5406
  overlapRepauseCedeMs = 1500;
5407
+ /** Speculative ENERGY pre-pause while speaking (AEC tier): two residue gate-passes within 350ms →
5408
+ * pause ~300ms before the STT tokens land. But energy CANNOT separate residue bursts from speech
5409
+ * (the documented whack-a-mole) — so a residue spike during loud playback false-pauses with NO user
5410
+ * speech at all, an audible hiccup. Default OFF: the genuine-gated STT partial is the
5411
+ * mechanism-correct pause trigger; enable only if barge-in onset feels sluggish in a clean-AEC room. */
5412
+ overlapEnergyHold = false;
5407
5413
  };
5408
5414
  var VoiceEngine = class _VoiceEngine {
5409
5415
  options;
@@ -5620,6 +5626,10 @@ var VoiceEngine = class _VoiceEngine {
5620
5626
  const txt = text.trim();
5621
5627
  if (!txt || txt === this.lastOverlapPartial) return;
5622
5628
  this.lastOverlapPartial = txt;
5629
+ if (!this.genuine(txt)) {
5630
+ if (this.pausedAt) this.armResume();
5631
+ return;
5632
+ }
5623
5633
  if (!this.pausedAt) {
5624
5634
  this.pausedAt = now();
5625
5635
  this.player.pause();
@@ -5629,7 +5639,7 @@ var VoiceEngine = class _VoiceEngine {
5629
5639
  return;
5630
5640
  }
5631
5641
  }
5632
- if (this.genuine(txt) && this.words(txt).length >= 2) {
5642
+ if (this.words(txt).length >= 2) {
5633
5643
  const phase = this.ctxOpen ? "speaking" : "drain";
5634
5644
  this.interrupt();
5635
5645
  this.options.onBargeIn(phase);
@@ -5729,7 +5739,7 @@ var VoiceEngine = class _VoiceEngine {
5729
5739
  // recent gate-PASSING chunks (helper zeroes residue — nonzero = vetted)
5730
5740
  handleLevel(rms) {
5731
5741
  if (this.usingAec) {
5732
- if (!this.speaking || !this.overlapCapable || this.pausedAt || rms < 50) return;
5742
+ if (!this.options.overlapEnergyHold || !this.speaking || !this.overlapCapable || this.pausedAt || rms < 50) return;
5733
5743
  const t = now();
5734
5744
  this.gatePassTimes = this.gatePassTimes.filter((x) => t - x < 350);
5735
5745
  this.gatePassTimes.push(t);
@@ -6906,7 +6916,7 @@ var trunc = (s, n) => (s == null ? "" : String(s).length > n ? String(s).slice(0
6906
6916
  // cli/voice.ts
6907
6917
  init_logging();
6908
6918
  import { spawn as spawn2, spawnSync as spawnSync2 } from "child_process";
6909
- import { existsSync as existsSync4, mkdirSync as mkdirSync5, statSync as statSync3 } from "fs";
6919
+ import { existsSync as existsSync4, mkdirSync as mkdirSync5, readFileSync as readFileSync4, statSync as statSync3 } from "fs";
6910
6920
  import { homedir as homedir3 } from "os";
6911
6921
  import { dirname as dirname3, join as join6 } from "path";
6912
6922
  import { fileURLToPath } from "url";
@@ -7179,7 +7189,9 @@ var VoiceIO = class extends VoiceEngine {
7179
7189
  tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
7180
7190
  player: o.player ?? duplex ?? new Player(),
7181
7191
  bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
7182
- bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
7192
+ bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor),
7193
+ overlapEnergyHold: process.env.OVERLAP_ENERGY_HOLD === "1" || o.overlapEnergyHold
7194
+ // textless residue pre-pause: opt-in (hiccup source)
7183
7195
  });
7184
7196
  }
7185
7197
  /** ready = keys present (AEC vs heuristic is decided at start()) */
@@ -7187,10 +7199,63 @@ var VoiceIO = class extends VoiceEngine {
7187
7199
  return !!(env.SONIOX_API_KEY && env.CARTESIA_API_KEY && env.CARTESIA_VOICE_ID);
7188
7200
  }
7189
7201
  };
7202
+ function fakeVoiceParts(uttFile) {
7203
+ let timer = null;
7204
+ let offset = 0;
7205
+ const stt = {
7206
+ usingAec: true,
7207
+ onPartial: (_) => {
7208
+ },
7209
+ onUtterance: (_, __) => {
7210
+ },
7211
+ onLevel: (_) => {
7212
+ },
7213
+ start() {
7214
+ timer = setInterval(() => {
7215
+ let buf;
7216
+ try {
7217
+ buf = readFileSync4(uttFile, "utf8");
7218
+ } catch {
7219
+ return;
7220
+ }
7221
+ if (buf.length <= offset) return;
7222
+ const fresh = buf.slice(offset);
7223
+ offset = buf.length;
7224
+ for (const line of fresh.split("\n")) {
7225
+ const t = line.trim();
7226
+ if (t) stt.onUtterance(t, Date.now());
7227
+ }
7228
+ }, 100);
7229
+ },
7230
+ reset() {
7231
+ },
7232
+ stop() {
7233
+ if (timer) {
7234
+ clearInterval(timer);
7235
+ timer = null;
7236
+ }
7237
+ }
7238
+ };
7239
+ const tts = { connect() {
7240
+ }, newContext() {
7241
+ }, speak() {
7242
+ }, end() {
7243
+ tts.onDone?.();
7244
+ }, cancel() {
7245
+ }, close() {
7246
+ }, onAudio: (_) => {
7247
+ }, onDone: () => {
7248
+ } };
7249
+ const player = { markTurn() {
7250
+ }, write() {
7251
+ }, drainMs: () => 0, playedMs: () => 0, kill() {
7252
+ } };
7253
+ return { stt, tts, player };
7254
+ }
7190
7255
 
7191
7256
  // cli/config.ts
7192
7257
  import { homedir as homedir4 } from "os";
7193
- import { existsSync as existsSync5, readFileSync as readFileSync4 } from "fs";
7258
+ import { existsSync as existsSync5, readFileSync as readFileSync5 } from "fs";
7194
7259
  import { join as join7 } from "path";
7195
7260
  import { pathToFileURL } from "url";
7196
7261
  var FILES = ["config.ts", "config.js", "config.mjs", "config.json"];
@@ -7212,7 +7277,7 @@ function loadSettings(dir) {
7212
7277
  const p = join7(dir, ".agent", "settings.json");
7213
7278
  if (!existsSync5(p)) return {};
7214
7279
  try {
7215
- const raw = JSON.parse(readFileSync4(p, "utf8"));
7280
+ const raw = JSON.parse(readFileSync5(p, "utf8"));
7216
7281
  const cfg = {};
7217
7282
  if (raw.mcpServers && typeof raw.mcpServers === "object") cfg.mcpServers = raw.mcpServers;
7218
7283
  if (raw.permissions && typeof raw.permissions === "object") cfg.permissions = raw.permissions;
@@ -7367,7 +7432,7 @@ function formatDiff(ops, opts = {}) {
7367
7432
  }
7368
7433
 
7369
7434
  // cli/session.ts
7370
- import { existsSync as existsSync6, mkdirSync as mkdirSync6, readFileSync as readFileSync5, writeFileSync as writeFileSync4, readdirSync, renameSync, symlinkSync as symlinkSync2, unlinkSync, readlinkSync } from "fs";
7435
+ import { existsSync as existsSync6, mkdirSync as mkdirSync6, readFileSync as readFileSync6, writeFileSync as writeFileSync4, readdirSync, renameSync, symlinkSync as symlinkSync2, unlinkSync, readlinkSync } from "fs";
7371
7436
  import { homedir as homedir5 } from "os";
7372
7437
  import { join as join8 } from "path";
7373
7438
  var log18 = forComponent("session");
@@ -7421,7 +7486,7 @@ var SessionStore = class {
7421
7486
  const path = join8(this.dir, `${id}.json`);
7422
7487
  if (!existsSync6(path)) return void 0;
7423
7488
  try {
7424
- return JSON.parse(readFileSync5(path, "utf8"));
7489
+ return JSON.parse(readFileSync6(path, "utf8"));
7425
7490
  } catch (e) {
7426
7491
  log18.debug(`unreadable session ${id} \u2014 ignoring`, e);
7427
7492
  return void 0;
@@ -7434,7 +7499,7 @@ var SessionStore = class {
7434
7499
  for (const f of readdirSync(this.dir)) {
7435
7500
  if (!f.endsWith(".json")) continue;
7436
7501
  try {
7437
- metas.push(JSON.parse(readFileSync5(join8(this.dir, f), "utf8")).meta);
7502
+ metas.push(JSON.parse(readFileSync6(join8(this.dir, f), "utf8")).meta);
7438
7503
  } catch (e) {
7439
7504
  log18.debug(`skipping unreadable session file ${f}`, e);
7440
7505
  }
@@ -7457,7 +7522,7 @@ function globalSessionLoad(idOrPrefix) {
7457
7522
  if (existsSync6(exact)) {
7458
7523
  try {
7459
7524
  const target = readlinkSync(exact);
7460
- return JSON.parse(readFileSync5(target, "utf8"));
7525
+ return JSON.parse(readFileSync6(target, "utf8"));
7461
7526
  } catch {
7462
7527
  return void 0;
7463
7528
  }
@@ -7468,7 +7533,7 @@ function globalSessionLoad(idOrPrefix) {
7468
7533
  const base = f.slice(0, -5);
7469
7534
  if (base.includes(idOrPrefix) || base.endsWith(idOrPrefix)) {
7470
7535
  const target = readlinkSync(join8(gd, f));
7471
- return JSON.parse(readFileSync5(target, "utf8"));
7536
+ return JSON.parse(readFileSync6(target, "utf8"));
7472
7537
  }
7473
7538
  }
7474
7539
  } catch {
@@ -7490,7 +7555,7 @@ function globalSessionList() {
7490
7555
  }
7491
7556
  continue;
7492
7557
  }
7493
- metas.push(JSON.parse(readFileSync5(target, "utf8")).meta);
7558
+ metas.push(JSON.parse(readFileSync6(target, "utf8")).meta);
7494
7559
  } catch {
7495
7560
  }
7496
7561
  }
@@ -8017,7 +8082,7 @@ function completePath(listDir, ref) {
8017
8082
  // cli/lineEditor.ts
8018
8083
  import { emitKeypressEvents } from "readline";
8019
8084
  import { spawnSync as spawnSync4 } from "child_process";
8020
- import { writeFileSync as writeFileSync7, readFileSync as readFileSync6, unlinkSync as unlinkSync2 } from "fs";
8085
+ import { writeFileSync as writeFileSync7, readFileSync as readFileSync7, unlinkSync as unlinkSync2 } from "fs";
8021
8086
  import { tmpdir as tmpdir2 } from "os";
8022
8087
  import { join as join11 } from "path";
8023
8088
 
@@ -9103,7 +9168,7 @@ function createLineEditor(out) {
9103
9168
  out.write("\x1B[?2004l");
9104
9169
  const r = spawnSync4(cmd, [...cargs, file], { stdio: "inherit" });
9105
9170
  if (r.status === 0) {
9106
- const text = readFileSync6(file, "utf8").replace(/\n$/, "");
9171
+ const text = readFileSync7(file, "utf8").replace(/\n$/, "");
9107
9172
  s.reset();
9108
9173
  if (text) s.insert(text);
9109
9174
  }
@@ -9876,7 +9941,7 @@ var err = (s) => process.stderr.write(s);
9876
9941
  var log22 = forComponent("cli");
9877
9942
  var VERSION = (() => {
9878
9943
  try {
9879
- return JSON.parse(readFileSync7(new URL("../package.json", import.meta.url), "utf8")).version ?? "?";
9944
+ return JSON.parse(readFileSync8(new URL("../package.json", import.meta.url), "utf8")).version ?? "?";
9880
9945
  } catch {
9881
9946
  return "?";
9882
9947
  }
@@ -10105,7 +10170,7 @@ function loadInstallEnv() {
10105
10170
  for (const name of [".env", ".env.local"]) {
10106
10171
  const file = join14(dir, name);
10107
10172
  if (!existsSync10(file)) continue;
10108
- for (const line of readFileSync7(file, "utf8").split("\n")) {
10173
+ for (const line of readFileSync8(file, "utf8").split("\n")) {
10109
10174
  const m = line.match(/^\s*(?:export\s+)?([A-Za-z_][A-Za-z0-9_]*)\s*=\s*(.*)$/);
10110
10175
  if (!m || m[1] in process.env) continue;
10111
10176
  let val = m[2].trim();
@@ -10706,7 +10771,7 @@ function readImageParts(cwd, line) {
10706
10771
  if (!mime) continue;
10707
10772
  const abs = ref.startsWith("~/") ? untilde(ref) : resolve3(cwd, ref);
10708
10773
  try {
10709
- parts.push(imagePart(`data:${mime};base64,${readFileSync7(abs).toString("base64")}`));
10774
+ parts.push(imagePart(`data:${mime};base64,${readFileSync8(abs).toString("base64")}`));
10710
10775
  } catch {
10711
10776
  }
10712
10777
  }
@@ -10951,7 +11016,7 @@ function initInstructions(cwd) {
10951
11016
  function persistSetting(cwd, key, value) {
10952
11017
  const path = join14(cwd, ".agent", "settings.json");
10953
11018
  try {
10954
- const obj = existsSync10(path) ? JSON.parse(readFileSync7(path, "utf8")) : {};
11019
+ const obj = existsSync10(path) ? JSON.parse(readFileSync8(path, "utf8")) : {};
10955
11020
  if (obj[key] === value) return;
10956
11021
  obj[key] = value;
10957
11022
  mkdirSync11(dirname4(path), { recursive: true });
@@ -11108,6 +11173,7 @@ async function repl(args, ai, cfg, cwd) {
11108
11173
  notify(e) {
11109
11174
  if (voiceIO && (e.kind === "thinking_delta" || e.kind === "turn_start")) return;
11110
11175
  if (e.kind === "text_delta" && voiceIO) {
11176
+ spinner.stop();
11111
11177
  voiceIO.speakDelta(e.message);
11112
11178
  editorRef?.suspend();
11113
11179
  voiceEcho(e.message);
@@ -11195,7 +11261,7 @@ async function repl(args, ai, cfg, cwd) {
11195
11261
  quickLook: {
11196
11262
  branch: () => {
11197
11263
  try {
11198
- const head = readFileSync7(join14(cwd, ".git", "HEAD"), "utf8").trim();
11264
+ const head = readFileSync8(join14(cwd, ".git", "HEAD"), "utf8").trim();
11199
11265
  return head.startsWith("ref: refs/heads/") ? `branch: ${head.slice("ref: refs/heads/".length)}` : `detached HEAD at ${head.slice(0, 12)}`;
11200
11266
  } catch {
11201
11267
  return "not a git repository";
@@ -11430,7 +11496,7 @@ Added entries are loadable now via the Skill/SlashCommand tools; removed ones ar
11430
11496
  </system-reminder>`;
11431
11497
  };
11432
11498
  const histPath = join14(cwd, ".agent", "history");
11433
- const history = existsSync10(histPath) ? readFileSync7(histPath, "utf8").split("\n").filter(Boolean).reverse().slice(0, 500) : [];
11499
+ const history = existsSync10(histPath) ? readFileSync8(histPath, "utf8").split("\n").filter(Boolean).reverse().slice(0, 500) : [];
11434
11500
  const remember = (line) => {
11435
11501
  try {
11436
11502
  mkdirSync11(join14(cwd, ".agent"), { recursive: true });
@@ -12723,11 +12789,13 @@ ${out}
12723
12789
  err(dim(" (voice needs --duplex on a TTY)\n"));
12724
12790
  return false;
12725
12791
  }
12726
- if (!VoiceIO.available()) {
12792
+ if (!process.env.AGENTX_VOICE_FAKE && !VoiceIO.available()) {
12727
12793
  err(dim(" (voice I/O off \u2014 set SONIOX_API_KEY, CARTESIA_API_KEY, CARTESIA_VOICE_ID to talk)\n"));
12728
12794
  return false;
12729
12795
  }
12796
+ const fakeVoice = process.env.AGENTX_VOICE_FAKE ? fakeVoiceParts(process.env.AGENTX_VOICE_FAKE) : null;
12730
12797
  voiceIO = new VoiceIO({
12798
+ ...fakeVoice ?? {},
12731
12799
  // No ack phrase by default: a fixed "Mm-hm," every turn reads robotic, Haiku's TTFT doesn't
12732
12800
  // need masking (~0.7-1.2s full turns), and the conversational register already opens with a
12733
12801
  // natural reaction. The mechanism (+ echo-leak guard) stays for slower voice models.