agent.libx.js 0.92.7 → 0.92.9

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.d.ts CHANGED
@@ -602,6 +602,12 @@ declare class DuplexAgentOptions {
602
602
  progressUpdates: boolean;
603
603
  /** Min ms between progress re-voices per task. */
604
604
  progressIntervalMs: number;
605
+ /** Relay worker questions (AskUserQuestion + permission asks via parkQuestion) through the VOICE:
606
+ * the question re-voices as '[task <id> asks] …', the user answers conversationally, and the
607
+ * voice model resolves it with the AnswerTask tool. Off → host.ask passthrough (text menus). */
608
+ askRelay: boolean;
609
+ /** Parked questions auto-resolve empty after this long (callers map '' to deny/best-judgment). */
610
+ askTimeoutMs: number;
605
611
  /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
606
612
  * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
607
613
  quickLook?: Record<string, (path?: string) => string | Promise<string>>;
@@ -621,6 +627,11 @@ declare class DuplexAgent {
621
627
  private seq;
622
628
  private pendingEvents;
623
629
  private flushQueued;
630
+ /** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
631
+ readonly pendingAsks: Map<string, {
632
+ question: string;
633
+ resolve: (answer: string) => void;
634
+ }>;
624
635
  constructor(options?: Partial<DuplexAgentOptions>);
625
636
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
626
637
  send(content: MessageContent): Promise<RunResult>;
@@ -640,6 +651,12 @@ declare class DuplexAgent {
640
651
  * (pre records the in-flight call; a self-cleaning timer narrates "still inside Bash — 70s").
641
652
  * Completion supersedes: nothing is emitted once the task has settled. */
642
653
  private progressReporter;
654
+ /** Park a question under `askId` (a task id, or any unique key for permission asks): re-voices
655
+ * '[task <id> asks] …' and resolves with the user's answer via AnswerTask — or '' on timeout/
656
+ * task settle (callers map '' to deny / best-judgment). Workers never block forever. */
657
+ parkQuestion(askId: string, question: string): Promise<string>;
658
+ /** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
659
+ private dropAsk;
643
660
  private onWorkerSettled;
644
661
  private onWorkerFailed;
645
662
  private failTask;
@@ -649,6 +666,7 @@ declare class DuplexAgent {
649
666
  * fs-only (no shell; the engine is VFS-abstracted): time, git branch (.git/HEAD read), ls, file
650
667
  * head. Output is hard-capped so a lookup can never bloat the skinny voice context. */
651
668
  private quickLookTool;
669
+ private answerTaskTool;
652
670
  private cancelTaskTool;
653
671
  }
654
672
 
@@ -837,6 +855,7 @@ declare class VoiceEngine {
837
855
  private pausedAt;
838
856
  private overlapLoud;
839
857
  private overlapLastLoudAt;
858
+ private loudTimes;
840
859
  private resumeTimer;
841
860
  constructor(options?: Partial<VoiceEngineOptions>);
842
861
  start(): Promise<void>;
package/dist/index.js CHANGED
@@ -3545,11 +3545,17 @@ var DuplexAgentOptions = class {
3545
3545
  progressUpdates = false;
3546
3546
  /** Min ms between progress re-voices per task. */
3547
3547
  progressIntervalMs = 25e3;
3548
+ /** Relay worker questions (AskUserQuestion + permission asks via parkQuestion) through the VOICE:
3549
+ * the question re-voices as '[task <id> asks] …', the user answers conversationally, and the
3550
+ * voice model resolves it with the AnswerTask tool. Off → host.ask passthrough (text menus). */
3551
+ askRelay = false;
3552
+ /** Parked questions auto-resolve empty after this long (callers map '' to deny/best-judgment). */
3553
+ askTimeoutMs = 12e4;
3548
3554
  /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
3549
3555
  * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
3550
3556
  quickLook;
3551
3557
  };
3552
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. 
Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3558
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. 
NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3553
3559
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
3554
3560
  var DuplexAgent = class {
3555
3561
  options;
@@ -3559,6 +3565,8 @@ var DuplexAgent = class {
3559
3565
  seq = 0;
3560
3566
  pendingEvents = [];
3561
3567
  flushQueued = false;
3568
+ /** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
3569
+ pendingAsks = /* @__PURE__ */ new Map();
3562
3570
  constructor(options) {
3563
3571
  this.options = { ...new DuplexAgentOptions(), ...options };
3564
3572
  const o = this.options;
@@ -3581,7 +3589,7 @@ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
3581
3589
  // no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
3582
3590
  // voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
3583
3591
  // tools come in via voiceOptions.tools and are merged here.
3584
- tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
3592
+ tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool(), this.answerTaskTool()]
3585
3593
  });
3586
3594
  }
3587
3595
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
@@ -3652,7 +3660,12 @@ ${recent}` : brief;
3652
3660
  report.output(chunk);
3653
3661
  }
3654
3662
  } : base;
3655
- const workerHost = o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
3663
+ const relayAsk = async (q) => {
3664
+ const opts = q.options?.length ? ` Options: ${q.options.map((x) => x.label).join(", ")}.` : "";
3665
+ const a = await this.parkQuestion(id, `${q.question}${opts}`);
3666
+ return a || "(no answer from the user \u2014 use your best judgment and note the assumption)";
3667
+ };
3668
+ const workerHost = o.askRelay ? { ask: relayAsk } : o.host?.ask ? { ask: (q) => o.host.ask(q) } : void 0;
3656
3669
  const worker = new Agent({
3657
3670
  ai: o.ai,
3658
3671
  fs: o.fs,
@@ -3676,6 +3689,7 @@ ${recent}` : brief;
3676
3689
  let steps = 0;
3677
3690
  let inflight = null;
3678
3691
  const due = () => {
3692
+ if (this.pendingAsks.size) return void 0;
3679
3693
  const rec = this.tasks.get(id);
3680
3694
  return rec && rec.status === "running" && Date.now() - lastAt >= this.options.progressIntervalMs ? rec : void 0;
3681
3695
  };
@@ -3708,7 +3722,35 @@ ${recent}` : brief;
3708
3722
  }
3709
3723
  };
3710
3724
  }
3725
+ /** Park a question under `askId` (a task id, or any unique key for permission asks): re-voices
3726
+ * '[task <id> asks] …' and resolves with the user's answer via AnswerTask — or '' on timeout/
3727
+ * task settle (callers map '' to deny / best-judgment). Workers never block forever. */
3728
+ parkQuestion(askId, question) {
3729
+ return new Promise((resolve) => {
3730
+ let settled = false;
3731
+ const finish = (answer) => {
3732
+ if (settled) return;
3733
+ settled = true;
3734
+ clearTimeout(timer);
3735
+ this.pendingAsks.delete(askId);
3736
+ resolve(answer);
3737
+ };
3738
+ const timer = setTimeout(() => {
3739
+ this.notify("task_ask_timeout", `task ${askId}: question timed out \u2014 proceeding without an answer`);
3740
+ finish("");
3741
+ }, this.options.askTimeoutMs);
3742
+ this.pendingAsks.set(askId, { question, resolve: finish });
3743
+ this.notify("task_ask", `task ${askId} asks: ${question}`, { id: askId, question });
3744
+ this.queueRevoice(`[task ${askId} asks] ${question}
3745
+ (Relay this to the user in your own words. When they answer, call AnswerTask with id "${askId}" and their answer.)`);
3746
+ });
3747
+ }
3748
+ /** Resolve any question a settling/cancelled task left parked (its answer can no longer matter). */
3749
+ dropAsk(id) {
3750
+ this.pendingAsks.get(id)?.resolve("");
3751
+ }
3711
3752
  onWorkerSettled(id, res) {
3753
+ this.dropAsk(id);
3712
3754
  const rec = this.tasks.get(id);
3713
3755
  if (res.finishReason === "aborted" || rec.status === "cancelled") {
3714
3756
  rec.status = "cancelled";
@@ -3728,6 +3770,7 @@ ${recent}` : brief;
3728
3770
  this.failTask(this.tasks.get(id), err instanceof Error ? err.message : String(err));
3729
3771
  }
3730
3772
  failTask(rec, msg) {
3773
+ this.dropAsk(rec.id);
3731
3774
  rec.status = "error";
3732
3775
  log7.warn(`task ${rec.id} failed: ${msg}`);
3733
3776
  this.notify("task_error", `task ${rec.id} (${rec.label}) failed: ${msg}`);
@@ -3820,6 +3863,23 @@ ${recent}` : brief;
3820
3863
  }
3821
3864
  };
3822
3865
  }
3866
+ answerTaskTool() {
3867
+ return {
3868
+ name: "AnswerTask",
3869
+ description: `Relay the user's answer to a pending question from a background task (the "[task <id> asks]" events). Pass the id from the event and the user's answer.`,
3870
+ parameters: {
3871
+ type: "object",
3872
+ required: ["id", "answer"],
3873
+ properties: { id: { type: "string" }, answer: { type: "string", description: "the user's answer, verbatim or faithfully summarized" } }
3874
+ },
3875
+ run: async ({ id, answer }) => {
3876
+ const ask = this.pendingAsks.get(String(id));
3877
+ if (!ask) return `No pending question for '${id}' \u2014 it may have been answered already or timed out.`;
3878
+ ask.resolve(String(answer ?? ""));
3879
+ return `Answer relayed \u2014 task ${id} resumes.`;
3880
+ }
3881
+ };
3882
+ }
3823
3883
  cancelTaskTool() {
3824
3884
  return {
3825
3885
  name: "CancelTask",
@@ -4008,7 +4068,7 @@ var VoiceEngineOptions = class {
4008
4068
  * vocabulary) resume from the precise sample and are dropped. false disables. */
4009
4069
  overlapPause = true;
4010
4070
  /** sustained overlap ≥ this → cede the turn */
4011
- overlapSustainMs = 350;
4071
+ overlapSustainMs = 450;
4012
4072
  /** quiet for this long while paused → resume, drop the interjection */
4013
4073
  overlapResumeMs = 700;
4014
4074
  /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
@@ -4050,6 +4110,8 @@ var VoiceEngine = class {
4050
4110
  // loud chunks since pause (sustain must be real sound, not two clicks)
4051
4111
  overlapLastLoudAt = 0;
4052
4112
  // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
4113
+ loudTimes = [];
4114
+ // recent loud-chunk timestamps (sliding onset window)
4053
4115
  resumeTimer = null;
4054
4116
  constructor(options) {
4055
4117
  this.options = { ...new VoiceEngineOptions(), ...options };
@@ -4269,14 +4331,18 @@ var VoiceEngine = class {
4269
4331
  if (rms < o.overlapRms) return;
4270
4332
  const t = now();
4271
4333
  if (!this.pausedAt) {
4334
+ this.loudTimes = this.loudTimes.filter((x) => t - x < 400);
4335
+ this.loudTimes.push(t);
4336
+ if (this.loudTimes.length < 2) return;
4337
+ this.loudTimes = [];
4272
4338
  this.pausedAt = t;
4273
- this.overlapLoud = 1;
4339
+ this.overlapLoud = 2;
4274
4340
  this.overlapLastLoudAt = t;
4275
4341
  this.player.pause();
4276
4342
  this.armResume();
4277
4343
  return;
4278
4344
  }
4279
- if (t - this.overlapLastLoudAt > 300) {
4345
+ if (t - this.overlapLastLoudAt > 450) {
4280
4346
  this.pausedAt = t;
4281
4347
  this.overlapLoud = 1;
4282
4348
  this.overlapLastLoudAt = t;
@@ -4285,7 +4351,7 @@ var VoiceEngine = class {
4285
4351
  }
4286
4352
  this.overlapLastLoudAt = t;
4287
4353
  this.overlapLoud++;
4288
- if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 4) {
4354
+ if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 3) {
4289
4355
  const phase = this.ctxOpen ? "speaking" : "drain";
4290
4356
  this.interrupt();
4291
4357
  this.options.onBargeIn(phase);
@@ -4309,6 +4375,7 @@ var VoiceEngine = class {
4309
4375
  if (this.pausedAt && resume) this.player.resume?.();
4310
4376
  this.pausedAt = 0;
4311
4377
  this.overlapLoud = 0;
4378
+ this.loudTimes = [];
4312
4379
  }
4313
4380
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4314
4381
  handleLevel(rms) {