agent.libx.js 0.92.6 → 0.92.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cli/cli.ts CHANGED
@@ -983,9 +983,24 @@ async function repl(args: Args, ai: ChatLike, cfg: Partial<AgentConfig>, cwd: st
983
983
  let duplexPersist: () => void = () => {}; // bound once the session exists (re-voice fires async)
984
984
  let duplexAccount: (data: any) => void = () => {}; // worker cost → session meta (bound below)
985
985
  // Workers are non-interactive: a permission 'ask' can't pop a menu mid-conversation (it would fight
986
- // the line editor for raw stdin). Auto-deny with a narratable reason the worker adapts or reports
987
- // back and the voice narrates it. Allow/deny rules (config, --allowedTools, --yes) apply as-is.
986
+ // the line editor for raw stdin). VOICE mode relays the ask through the conversation (park
987
+ // '[task asks]' re-voice spoken yes/no via AnswerTask; timeout → deny). Text duplex (and the
988
+ // relay's timeout path) auto-denies with a narratable reason — the worker adapts or reports back.
989
+ let permSeq = 0;
988
990
  const duplexAsk = async (call: ToolUse): Promise<{ decision: 'allow' | 'deny' }> => {
991
+ if (args.voice && dx) {
992
+ const hint = summarizeCall(call.name, call.args).slice(0, 80);
993
+ // NB: perm asks are keyed perm-N (PermissionPolicy.ask carries no task identity), so a
994
+ // cancelled task can't clean its parked perm question — bounded by askTimeoutMs → deny.
995
+ const id = `perm-${++permSeq}`;
996
+ err('\r\x1b[0J' + yellow(` ? worker asks to run ${call.name}`) + dim(` ${hint} — say yes/no (or it auto-denies)\n`));
997
+ editorRef?.redrawNow();
998
+ const a = await dx.parkQuestion(id, `Permission: may the background worker run ${call.name}${hint ? ` (${hint})` : ''}? Yes or no.`);
999
+ const allow = /^\s*(y(es|ep|eah)?|sure|ok(ay)?|allow|go|approved?|do it)\b/i.test(a);
1000
+ err('\r\x1b[0J' + dim(` ${allow ? '✓ allowed' : '⊘ denied'} ${call.name} (${a.trim() || 'no answer'})\n`));
1001
+ editorRef?.redrawNow();
1002
+ return { decision: allow ? 'allow' : 'deny' };
1003
+ }
989
1004
  err('\r\x1b[0J' + yellow(` ⊘ worker asked to run ${call.name} — auto-denied (no interactive approval in duplex; use --yes or --allowedTools)\n`));
990
1005
  editorRef?.redrawNow(); // background event at a live prompt — repaint below the notice
991
1006
  return { decision: 'deny' };
@@ -1073,7 +1088,7 @@ async function repl(args: Args, ai: ChatLike, cfg: Partial<AgentConfig>, cwd: st
1073
1088
  workerModel: agent.options.model,
1074
1089
  workerOptions,
1075
1090
  host,
1076
- ...(args.voice ? { voiceStyle: 'conversational' as const, progressUpdates: true } : {}), // voice: narrate throttled worker progress (dead air is worse than a short aside)
1091
+ ...(args.voice ? { voiceStyle: 'conversational' as const, progressUpdates: true, askRelay: true } : {}), // voice: progress asides + worker questions relayed through the conversation
1077
1092
  // Per-TASK checkpoint frames (the natural undo unit in duplex = one delegation): opened BEFORE
1078
1093
  // the worker spawns (post-spawn would race its first edits). `checkpoints` is bound below.
1079
1094
  onTaskStart: async (_id, label) => { await checkpoints.begin(label); },
package/dist/cli.js CHANGED
@@ -3445,11 +3445,17 @@ var DuplexAgentOptions = class {
3445
3445
  progressUpdates = false;
3446
3446
  /** Min ms between progress re-voices per task. */
3447
3447
  progressIntervalMs = 25e3;
3448
+ /** Relay worker questions (AskUserQuestion + permission asks via parkQuestion) through the VOICE:
3449
+ * the question re-voices as '[task <id> asks] …', the user answers conversationally, and the
3450
+ * voice model resolves it with the AnswerTask tool. Off → host.ask passthrough (text menus). */
3451
+ askRelay = false;
3452
+ /** Parked questions auto-resolve empty after this long (callers map '' to deny/best-judgment). */
3453
+ askTimeoutMs = 12e4;
3448
3454
  /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
3449
3455
  * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
3450
3456
  quickLook;
3451
3457
  };
3452
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3458
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3453
3459
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
3454
3460
  var DuplexAgent = class {
3455
3461
  options;
@@ -3459,6 +3465,8 @@ var DuplexAgent = class {
3459
3465
  seq = 0;
3460
3466
  pendingEvents = [];
3461
3467
  flushQueued = false;
3468
+ /** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
3469
+ pendingAsks = /* @__PURE__ */ new Map();
3462
3470
  constructor(options) {
3463
3471
  this.options = { ...new DuplexAgentOptions(), ...options };
3464
3472
  const o = this.options;
@@ -3481,7 +3489,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
3481
3489
  // no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
3482
3490
  // voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
3483
3491
  // tools come in via voiceOptions.tools and are merged here.
3484
- tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
3492
+ tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool(), this.answerTaskTool()]
3485
3493
  });
3486
3494
  }
3487
3495
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
@@ -3552,12 +3560,19 @@ ${recent}` : brief;
3552
3560
  report.output(chunk);
3553
3561
  }
3554
3562
  } : base;
3563
+ const relayAsk = async (q2) => {
3564
+ const opts = q2.options?.length ? ` Options: ${q2.options.map((x) => x.label).join(", ")}.` : "";
3565
+ const a = await this.parkQuestion(id, `${q2.question}${opts}`);
3566
+ return a || "(no answer from the user \u2014 use your best judgment and note the assumption)";
3567
+ };
3568
+ const workerHost = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q2) => o.host.ask(q2) } : void 0;
3555
3569
  const worker = new Agent({
3556
3570
  ai: o.ai,
3557
3571
  fs: o.fs,
3558
3572
  model: o.workerModel,
3559
3573
  ...o.workerOptions,
3560
3574
  // may override ai/fs/model/tools/… —
3575
+ ...workerHost ? { host: workerHost } : {},
3561
3576
  ...hooks ? { hooks } : {},
3562
3577
  signal: controller.signal
3563
3578
  // …but never the per-task cancellation signal
@@ -3606,7 +3621,35 @@ ${recent}` : brief;
3606
3621
  }
3607
3622
  };
3608
3623
  }
3624
+ /** Park a question under `askId` (a task id, or any unique key for permission asks): re-voices
3625
+ * '[task <id> asks] …' and resolves with the user's answer via AnswerTask — or '' on timeout/
3626
+ * task settle (callers map '' to deny / best-judgment). Workers never block forever. */
3627
+ parkQuestion(askId, question) {
3628
+ return new Promise((resolve4) => {
3629
+ let settled = false;
3630
+ const finish = (answer) => {
3631
+ if (settled) return;
3632
+ settled = true;
3633
+ clearTimeout(timer);
3634
+ this.pendingAsks.delete(askId);
3635
+ resolve4(answer);
3636
+ };
3637
+ const timer = setTimeout(() => {
3638
+ this.notify("task_ask_timeout", `task ${askId}: question timed out \u2014 proceeding without an answer`);
3639
+ finish("");
3640
+ }, this.options.askTimeoutMs);
3641
+ this.pendingAsks.set(askId, { question, resolve: finish });
3642
+ this.notify("task_ask", `task ${askId} asks: ${question}`, { id: askId, question });
3643
+ this.queueRevoice(`[task ${askId} asks] ${question}
3644
+ (Relay this to the user in your own words. When they answer, call AnswerTask with id "${askId}" and their answer.)`);
3645
+ });
3646
+ }
3647
+ /** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
3648
+ dropAsk(id) {
3649
+ this.pendingAsks.get(id)?.resolve("");
3650
+ }
3609
3651
  onWorkerSettled(id, res) {
3652
+ this.dropAsk(id);
3610
3653
  const rec = this.tasks.get(id);
3611
3654
  if (res.finishReason === "aborted" || rec.status === "cancelled") {
3612
3655
  rec.status = "cancelled";
@@ -3626,6 +3669,7 @@ ${recent}` : brief;
3626
3669
  this.failTask(this.tasks.get(id), err2 instanceof Error ? err2.message : String(err2));
3627
3670
  }
3628
3671
  failTask(rec, msg) {
3672
+ this.dropAsk(rec.id);
3629
3673
  rec.status = "error";
3630
3674
  log6.warn(`task ${rec.id} failed: ${msg}`);
3631
3675
  this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
@@ -3718,6 +3762,23 @@ ${recent}` : brief;
3718
3762
  }
3719
3763
  };
3720
3764
  }
3765
+ answerTaskTool() {
3766
+ return {
3767
+ name: "AnswerTask",
3768
+ description: `Relay the user's answer to a pending question from a background task (the "[task <id> asks]" events). Pass the id from the event and the user's answer.`,
3769
+ parameters: {
3770
+ type: "object",
3771
+ required: ["id", "answer"],
3772
+ properties: { id: { type: "string" }, answer: { type: "string", description: "the user's answer, verbatim or faithfully summarized" } }
3773
+ },
3774
+ run: async ({ id, answer }) => {
3775
+ const ask = this.pendingAsks.get(String(id));
3776
+ if (!ask) return `No pending question for '${id}' \u2014 it may have been answered already or timed out.`;
3777
+ ask.resolve(String(answer ?? ""));
3778
+ return `Answer relayed \u2014 task ${id} resumes.`;
3779
+ }
3780
+ };
3781
+ }
3721
3782
  cancelTaskTool() {
3722
3783
  return {
3723
3784
  name: "CancelTask",
@@ -3805,6 +3866,18 @@ var VoiceEngineOptions = class {
3805
3866
  /** heuristic (non-AEC) energy barge-in tuning */
3806
3867
  bargeRmsMult = 2;
3807
3868
  bargeRmsFloor = 500;
3869
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
3870
+ * onset → PAUSE (exact-sample hold, nothing lost); sustained overlap → cede (interrupt; the LLM
3871
+ * re-enters). Brief overlaps that die out (backchannels — "mm-hm", decided by DURATION, not
3872
+ * vocabulary) resume from the precise sample and are dropped. false disables. */
3873
+ overlapPause = true;
3874
+ /** sustained overlap ≥ this → cede the turn */
3875
+ overlapSustainMs = 350;
3876
+ /** quiet for this long while paused → resume, drop the interjection */
3877
+ overlapResumeMs = 700;
3878
+ /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
3879
+ * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
3880
+ overlapRms = 300;
3808
3881
  };
3809
3882
  var VoiceEngine = class {
3810
3883
  options;
@@ -3835,6 +3908,13 @@ var VoiceEngine = class {
3835
3908
  // endpointed text held for the merge window
3836
3909
  pendingTimer = null;
3837
3910
  lastInterrupted = null;
3911
+ // overlap (pause) tier state — AEC + pause-capable sinks only
3912
+ pausedAt = 0;
3913
+ overlapLoud = 0;
3914
+ // loud chunks since pause (sustain must be real sound, not two clicks)
3915
+ overlapLastLoudAt = 0;
3916
+ // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
3917
+ resumeTimer = null;
3838
3918
  constructor(options) {
3839
3919
  this.options = { ...new VoiceEngineOptions(), ...options };
3840
3920
  const o = this.options;
@@ -3882,6 +3962,7 @@ var VoiceEngine = class {
3882
3962
  this.drainTimer = null;
3883
3963
  }
3884
3964
  this.interrupted = false;
3965
+ this.resetOverlap(true);
3885
3966
  if (!this.speaking) this.player.markTurn();
3886
3967
  this.speaking = true;
3887
3968
  this.ctxOpen = true;
@@ -3916,6 +3997,10 @@ var VoiceEngine = class {
3916
3997
  this.drainTimer = null;
3917
3998
  return;
3918
3999
  }
4000
+ if (this.pausedAt) {
4001
+ this.drainTimer = setTimeout(settle, 250);
4002
+ return;
4003
+ }
3919
4004
  this.drainTimer = null;
3920
4005
  this.speaking = false;
3921
4006
  this.echoUntil = now() + 2500;
@@ -3947,6 +4032,7 @@ var VoiceEngine = class {
3947
4032
  clearTimeout(this.drainTimer);
3948
4033
  this.drainTimer = null;
3949
4034
  }
4035
+ this.resetOverlap(false);
3950
4036
  const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
3951
4037
  if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
3952
4038
  this.speaking = false;
@@ -3961,6 +4047,7 @@ var VoiceEngine = class {
3961
4047
  this.setState("listening");
3962
4048
  }
3963
4049
  stop() {
4050
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
3964
4051
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
3965
4052
  if (this.drainTimer) clearTimeout(this.drainTimer);
3966
4053
  this.stt.stop();
@@ -3986,12 +4073,11 @@ var VoiceEngine = class {
3986
4073
  genuine(text) {
3987
4074
  const total = this.words(text).length;
3988
4075
  const novel = this.novelWords(text).length;
3989
- if (!novel) return false;
3990
- return novel >= 2 || novel / Math.max(1, total) > 0.5;
4076
+ return novel > 0 && novel / Math.max(1, total) > 0.5;
3991
4077
  }
3992
4078
  handlePartial(text) {
3993
4079
  if (this.speaking) {
3994
- const barge = this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
4080
+ const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
3995
4081
  if (barge) {
3996
4082
  const phase = this.ctxOpen ? "speaking" : "drain";
3997
4083
  this.interrupt();
@@ -4008,6 +4094,10 @@ var VoiceEngine = class {
4008
4094
  if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
4009
4095
  }
4010
4096
  handleUtterance(text) {
4097
+ if (this.speaking && this.ctxOpen && this.overlapCapable) {
4098
+ this.stt.reset();
4099
+ return;
4100
+ }
4011
4101
  if (this.echoActive() && (this.usingAec ? !this.genuine(text) : this.novelWords(text).length < 2)) {
4012
4102
  this.stt.reset();
4013
4103
  return;
@@ -4031,9 +4121,63 @@ var VoiceEngine = class {
4031
4121
  this.pendingUtt = "";
4032
4122
  if (text) this.options.onUtterance(text);
4033
4123
  }
4124
+ get overlapCapable() {
4125
+ return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
4126
+ }
4127
+ /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
4128
+ * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
4129
+ * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
4130
+ handleOverlap(rms) {
4131
+ const o = this.options;
4132
+ if (!this.speaking || !this.overlapCapable) return;
4133
+ if (rms < o.overlapRms) return;
4134
+ const t = now();
4135
+ if (!this.pausedAt) {
4136
+ this.overlapLoud = t - this.overlapLastLoudAt <= 60 ? this.overlapLoud + 1 : 1;
4137
+ this.overlapLastLoudAt = t;
4138
+ if (this.overlapLoud < 3) return;
4139
+ this.pausedAt = t;
4140
+ this.player.pause();
4141
+ this.armResume();
4142
+ return;
4143
+ }
4144
+ if (t - this.overlapLastLoudAt > 300) {
4145
+ this.pausedAt = t;
4146
+ this.overlapLoud = 1;
4147
+ this.overlapLastLoudAt = t;
4148
+ this.armResume();
4149
+ return;
4150
+ }
4151
+ this.overlapLastLoudAt = t;
4152
+ this.overlapLoud++;
4153
+ if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
4154
+ const phase = this.ctxOpen ? "speaking" : "drain";
4155
+ this.interrupt();
4156
+ this.options.onBargeIn(phase);
4157
+ return;
4158
+ }
4159
+ this.armResume();
4160
+ }
4161
+ armResume() {
4162
+ if (this.resumeTimer) clearTimeout(this.resumeTimer);
4163
+ this.resumeTimer = setTimeout(() => {
4164
+ this.resumeTimer = null;
4165
+ if (!this.pausedAt) return;
4166
+ this.resetOverlap(true);
4167
+ }, this.options.overlapResumeMs);
4168
+ }
4169
+ resetOverlap(resume) {
4170
+ if (this.resumeTimer) {
4171
+ clearTimeout(this.resumeTimer);
4172
+ this.resumeTimer = null;
4173
+ }
4174
+ if (this.pausedAt && resume) this.player.resume?.();
4175
+ this.pausedAt = 0;
4176
+ this.overlapLoud = 0;
4177
+ }
4034
4178
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4035
4179
  handleLevel(rms) {
4036
- if (this.usingAec) return;
4180
+ if (this.usingAec) return this.handleOverlap(rms);
4037
4181
  if (!this.speaking) {
4038
4182
  this.baseline = 0;
4039
4183
  this.hot = 0;
@@ -4983,6 +5127,102 @@ var NodeMicSource = class {
4983
5127
  }, 500).unref?.();
4984
5128
  }
4985
5129
  };
5130
+ var AecDuplexAudio = class {
5131
+ constructor(bin) {
5132
+ this.bin = bin;
5133
+ }
5134
+ bin;
5135
+ aec = true;
5136
+ proc = null;
5137
+ stopped = false;
5138
+ bytesWritten = 0;
5139
+ startedAt = 0;
5140
+ // --- AudioSource ---
5141
+ start(onChunk) {
5142
+ this.proc = spawn2(this.bin, [], { stdio: ["pipe", "pipe", "ignore"] });
5143
+ this.proc.stdin.on("error", () => {
5144
+ });
5145
+ this.proc.on("exit", (c) => {
5146
+ if (c && !this.stopped) log12.error(`aec duplex audio exited (${c}) \u2014 check mic permission / MIC_AEC=0`);
5147
+ });
5148
+ this.proc.stdout.on("data", (chunk) => onChunk(chunk));
5149
+ }
5150
+ stop() {
5151
+ this.stopped = true;
5152
+ const p = this.proc;
5153
+ this.proc = null;
5154
+ if (!p) return;
5155
+ p.kill("SIGTERM");
5156
+ setTimeout(() => {
5157
+ try {
5158
+ p.kill("SIGKILL");
5159
+ } catch {
5160
+ }
5161
+ }, 500).unref?.();
5162
+ }
5163
+ // --- AudioSink (frame writer; same played/drain byte-math as the ffplay Player) ---
5164
+ frame(payload) {
5165
+ const stdin = this.proc?.stdin;
5166
+ if (!stdin || stdin.destroyed) return;
5167
+ const hdr = Buffer.alloc(4);
5168
+ hdr.writeUInt32LE(payload ? payload.length : 0);
5169
+ stdin.write(hdr);
5170
+ if (payload?.length) stdin.write(payload);
5171
+ }
5172
+ markTurn() {
5173
+ this.frame(null);
5174
+ this.bytesWritten = 0;
5175
+ this.startedAt = 0;
5176
+ this.pausedSince = 0;
5177
+ this.pausedAccum = 0;
5178
+ }
5179
+ write(chunk) {
5180
+ if (!this.startedAt) this.startedAt = now4();
5181
+ this.bytesWritten += chunk.length;
5182
+ this.frame(chunk);
5183
+ }
5184
+ playedMs() {
5185
+ return this.startedAt ? now4() - this.startedAt - this.pausedMs() : 0;
5186
+ }
5187
+ drainMs() {
5188
+ if (!this.startedAt) return 0;
5189
+ const queuedMs = this.bytesWritten / (TTS_SAMPLE_RATE * 2) * 1e3;
5190
+ return Math.max(0, queuedMs - (now4() - this.startedAt - this.pausedMs()));
5191
+ }
5192
+ /** barge-in: silence NOW (in-band flush) — the capture side keeps running */
5193
+ kill() {
5194
+ this.frame(null);
5195
+ this.bytesWritten = 0;
5196
+ this.startedAt = 0;
5197
+ this.pausedSince = 0;
5198
+ this.pausedAccum = 0;
5199
+ }
5200
+ /** overlap trail-off: exact-sample PAUSE (len==0xFFFFFFFF) / RESUME (len==0xFFFFFFFE) frames */
5201
+ pausedSince = 0;
5202
+ pausedAccum = 0;
5203
+ ctl(code) {
5204
+ const stdin = this.proc?.stdin;
5205
+ if (!stdin || stdin.destroyed) return;
5206
+ const f = Buffer.alloc(4);
5207
+ f.writeUInt32LE(code, 0);
5208
+ stdin.write(f);
5209
+ }
5210
+ pause() {
5211
+ if (this.pausedSince) return;
5212
+ this.pausedSince = now4();
5213
+ this.ctl(4294967295);
5214
+ }
5215
+ resume() {
5216
+ if (!this.pausedSince) return;
5217
+ this.pausedAccum += now4() - this.pausedSince;
5218
+ this.pausedSince = 0;
5219
+ this.ctl(4294967294);
5220
+ }
5221
+ /** total paused time this turn — excluded from played/drain math (the tape held still) */
5222
+ pausedMs() {
5223
+ return this.pausedAccum + (this.pausedSince ? now4() - this.pausedSince : 0);
5224
+ }
5225
+ };
4986
5226
  var VoiceIOOptions = class extends VoiceEngineOptions {
4987
5227
  sonioxApiKey = process.env.SONIOX_API_KEY ?? "";
4988
5228
  cartesiaApiKey = process.env.CARTESIA_API_KEY ?? "";
@@ -4991,11 +5231,13 @@ var VoiceIOOptions = class extends VoiceEngineOptions {
4991
5231
  var VoiceIO = class extends VoiceEngine {
4992
5232
  constructor(options) {
4993
5233
  const o = { ...new VoiceIOOptions(), ...options };
5234
+ const bin = !o.stt || !o.player ? resolveAecBinary() : null;
5235
+ const duplex = bin ? new AecDuplexAudio(bin) : null;
4994
5236
  super({
4995
5237
  ...o,
4996
- stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: new NodeMicSource() }),
5238
+ stt: o.stt ?? new SonioxSTT({ auth: o.sonioxApiKey, source: duplex ?? new NodeMicSource() }),
4997
5239
  tts: o.tts ?? new CartesiaTTS({ auth: o.cartesiaApiKey, voiceId: o.cartesiaVoiceId }),
4998
- player: o.player ?? new Player(),
5240
+ player: o.player ?? duplex ?? new Player(),
4999
5241
  bargeRmsMult: Number(process.env.BARGE_RMS_MULT || o.bargeRmsMult),
5000
5242
  bargeRmsFloor: Number(process.env.BARGE_RMS_FLOOR || o.bargeRmsFloor)
5001
5243
  });
@@ -7720,7 +7962,21 @@ async function repl(args, ai, cfg, cwd) {
7720
7962
  };
7721
7963
  let duplexAccount = () => {
7722
7964
  };
7965
+ let permSeq = 0;
7723
7966
  const duplexAsk = async (call) => {
7967
+ if (args.voice && dx) {
7968
+ const hint = summarizeCall(call.name, call.args).slice(0, 80);
7969
+ const id = `perm-${++permSeq}`;
7970
+ err("\r\x1B[0J" + yellow(` ? worker asks to run ${call.name}`) + dim(` ${hint} \u2014 say yes/no (or it auto-denies)
7971
+ `));
7972
+ editorRef?.redrawNow();
7973
+ const a = await dx.parkQuestion(id, `Permission: may the background worker run ${call.name}${hint ? ` (${hint})` : ""}? Yes or no.`);
7974
+ const allow = /^\s*(y(es|ep|eah)?|sure|ok(ay)?|allow|go|approved?|do it)\b/i.test(a);
7975
+ err("\r\x1B[0J" + dim(` ${allow ? "\u2713 allowed" : "\u2298 denied"} ${call.name} (${a.trim() || "no answer"})
7976
+ `));
7977
+ editorRef?.redrawNow();
7978
+ return { decision: allow ? "allow" : "deny" };
7979
+ }
7724
7980
  err("\r\x1B[0J" + yellow(` \u2298 worker asked to run ${call.name} \u2014 auto-denied (no interactive approval in duplex; use --yes or --allowedTools)
7725
7981
  `));
7726
7982
  editorRef?.redrawNow();
@@ -7794,8 +8050,8 @@ async function repl(args, ai, cfg, cwd) {
7794
8050
  workerModel: agent.options.model,
7795
8051
  workerOptions,
7796
8052
  host,
7797
- ...args.voice ? { voiceStyle: "conversational", progressUpdates: true } : {},
7798
- // voice: narrate throttled worker progress (dead air is worse than a short aside)
8053
+ ...args.voice ? { voiceStyle: "conversational", progressUpdates: true, askRelay: true } : {},
8054
+ // voice: progress asides + worker questions relayed through the conversation
7799
8055
  // Per-TASK checkpoint frames (the natural undo unit in duplex = one delegation): opened BEFORE
7800
8056
  // the worker spawns (post-spawn would race its first edits). `checkpoints` is bound below.
7801
8057
  onTaskStart: async (_id, label) => {