agent.libx.js 0.89.9 → 0.92.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -851,6 +851,17 @@ ${out}`.trim() : out || "(command succeeded, no output)";
851
851
  );
852
852
  return `Started background job ${id} \u2014 poll with JobOutput({id:"${id}"}) / JobStatus, stop with JobKill.`;
853
853
  }
854
/**
 * Build the `ExitSession` tool, which lets the model end the CLI session.
 *
 * @param {() => void} onExit - Host callback fired when the tool runs. It is
 *   called synchronously and its return value is ignored (never awaited).
 * @returns {{name: string, description: string, parameters: object, run: () => Promise<string>}}
 *   A tool definition; `run` invokes `onExit` and resolves to a farewell message.
 */
function exitSessionTool(onExit) {
  const description = `End the current session and exit the CLI. Call this when the user says goodbye, asks to quit, or clearly indicates they want to stop the conversation (e.g. "ok bye", "that's all", "exit", "goodnight").`;
  // The tool takes no arguments, hence the empty object schema.
  const parameters = { type: "object", properties: {} };
  const tool = {
    name: "ExitSession",
    description,
    parameters,
    async run() {
      onExit();
      return "Session ending. Goodbye!";
    }
  };
  return tool;
}
854
865
  function defaultTools() {
855
866
  return [bashTool, readTool, editTool];
856
867
  }
@@ -2835,7 +2846,15 @@ var Agent = class _Agent {
2835
2846
  toolCallsTotal += toolCalls.length;
2836
2847
  if (o.maxToolCalls && toolCallsTotal > o.maxToolCalls) return kill("max_tool_calls");
2837
2848
  for (const tc of toolCalls) {
2838
- const content = await this.dispatch(tc);
2849
+ const raw = await this.dispatch(tc);
2850
+ let content;
2851
+ if (typeof raw === "string") {
2852
+ content = raw;
2853
+ } else {
2854
+ const parts = [{ type: "text", text: raw.text }];
2855
+ for (const img of raw.images ?? []) parts.push(imagePart(`data:${img.mimeType};base64,${img.data}`));
2856
+ content = parts;
2857
+ }
2839
2858
  this.transcript.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content });
2840
2859
  }
2841
2860
  }
@@ -2892,10 +2911,17 @@ var Agent = class _Agent {
2892
2911
  return earlyError;
2893
2912
  }
2894
2913
  let result;
2914
+ let images;
2895
2915
  let threw = false;
2896
2916
  try {
2897
2917
  log3.debug(`${tc.function.name}(${tc.function.arguments})`);
2898
- result = await tool.run(args, this.ctx);
2918
+ const raw = await tool.run(args, this.ctx);
2919
+ if (typeof raw === "string") {
2920
+ result = raw;
2921
+ } else {
2922
+ result = raw.text;
2923
+ images = raw.images;
2924
+ }
2899
2925
  } catch (e) {
2900
2926
  const msg = e instanceof Error ? e.message : String(e);
2901
2927
  log3.debug(`${tc.function.name} -> error: ${msg}`);
@@ -2905,7 +2931,12 @@ var Agent = class _Agent {
2905
2931
  if (!threw) result = await this.maybeAutoTest(tc.function.name, result);
2906
2932
  await hooks?.postToolUse?.(call, result, meta);
2907
2933
  this.options.host?.notify?.({ kind: "tool_result", id: tc.id ?? "", output: result, isError: threw });
2908
- return result;
2934
+ if (images?.length) {
2935
+ for (const img of images) {
2936
+ this.options.host?.notify?.({ kind: "tool_result_image", id: tc.id ?? "", dataUrl: `data:${img.mimeType};base64,${img.data}` });
2937
+ }
2938
+ }
2939
+ return images?.length ? { text: result, images } : result;
2909
2940
  }
2910
2941
  static WRITE_CLASS = ["Write", "Edit", "MultiEdit", "ApplyEdits"];
2911
2942
  /** Append an autoTest failure section to a write-class tool result, if configured. */
@@ -3452,6 +3483,11 @@ function digestRun(messages, maxChars) {
3452
3483
  import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
3453
3484
  init_logging();
3454
3485
  var log7 = forComponent("DuplexAgent");
3486
/**
 * Short human-readable label for a tool call: the tool name plus, when one
 * exists, a hint taken from the first non-blank string argument.
 *
 * The hint has whitespace runs collapsed to single spaces and is limited to
 * 48 characters. Truncation counts code points rather than UTF-16 units so a
 * surrogate pair (emoji, astral CJK) is never cut in half — a lone surrogate
 * would otherwise end up in the progress line.
 *
 * @param {{name: string, args?: object}} call - Tool call; `args` is only
 *   inspected when it is a non-null object (a raw string is ignored rather
 *   than being exploded into single characters).
 * @returns {string} `name` or `name (hint)`.
 */
function describeCall(call) {
  const HINT_MAX = 48;
  const args = call.args;
  const values = args !== null && typeof args === "object" ? Object.values(args) : [];
  const first = values.find((x) => typeof x === "string" && x.trim());
  if (!first) return `${call.name}`;
  const flat = first.replace(/\s+/g, " ").trim();
  const hint = Array.from(flat).slice(0, HINT_MAX).join("");
  return `${call.name} (${hint})`;
}
3455
3491
  var DuplexAgentOptions = class {
3456
3492
  /** Any ai.libx.js AIClient — shared by the voice and worker agents (routed by model). */
3457
3493
  ai;
@@ -3472,8 +3508,16 @@ var DuplexAgentOptions = class {
3472
3508
  /** Awaited BEFORE a delegated worker spawns — open a per-task checkpoint frame, audit, etc.
3473
3509
  * (post-spawn would race the worker's first edits). */
3474
3510
  onTaskStart;
3511
+ /** Re-voice throttled worker progress asides ('[task t1 progress] …') so long tasks aren't dead
3512
+ * air. Off by default — each update costs a voice turn (LLM call + speech). */
3513
+ progressUpdates = false;
3514
+ /** Min ms between progress re-voices per task. */
3515
+ progressIntervalMs = 25e3;
3516
+ /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
3517
+ * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
3518
+ quickLook;
3475
3519
  };
3476
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. Never read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.';
3520
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. 
Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3477
3521
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
3478
3522
  var DuplexAgent = class {
3479
3523
  options;
@@ -3493,7 +3537,10 @@ var DuplexAgent = class {
3493
3537
  model: o.voiceModel,
3494
3538
  stream: true,
3495
3539
  host: o.host,
3496
- systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : ""),
3540
+ // Runtime context line: without it the voice confidently invents "facts" like today's date
3541
+ // (its training cutoff) instead of delegating or admitting it doesn't know.
3542
+ systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
3543
+ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
3497
3544
  instructionFiles: false,
3498
3545
  maxSteps: 8,
3499
3546
  // a voice turn should never loop
@@ -3502,7 +3549,7 @@ var DuplexAgent = class {
3502
3549
  // no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
3503
3550
  // voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
3504
3551
  // tools come in via voiceOptions.tools and are merged here.
3505
- tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool()]
3552
+ tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
3506
3553
  });
3507
3554
  }
3508
3555
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
@@ -3555,18 +3602,69 @@ ${recent}` : brief;
3555
3602
  spawnWorker(id, label, briefText) {
3556
3603
  const o = this.options;
3557
3604
  const controller = new AbortController();
3605
+ const base = o.workerOptions?.hooks;
3606
+ const report = o.progressUpdates ? this.progressReporter(id) : void 0;
3607
+ const hooks = report ? {
3608
+ ...base,
3609
+ preToolUse: async (call, meta) => {
3610
+ const d = await base?.preToolUse?.(call, meta);
3611
+ report.pre(call);
3612
+ return d;
3613
+ },
3614
+ postToolUse: async (call, result, meta) => {
3615
+ await base?.postToolUse?.(call, result, meta);
3616
+ report.post(call);
3617
+ }
3618
+ } : base;
3558
3619
  const worker = new Agent({
3559
3620
  ai: o.ai,
3560
3621
  fs: o.fs,
3561
3622
  model: o.workerModel,
3562
3623
  ...o.workerOptions,
3563
3624
  // may override ai/fs/model/tools/… —
3625
+ ...hooks ? { hooks } : {},
3564
3626
  signal: controller.signal
3565
3627
  // …but never the per-task cancellation signal
3566
3628
  });
3567
3629
  const promise = worker.run(briefText).then((res) => this.onWorkerSettled(id, res)).catch((err) => this.onWorkerFailed(id, err));
3568
3630
  this.tasks.set(id, { id, label, status: "running", controller, promise });
3569
3631
  }
3632
+ /** Throttled per-task progress: worker tool calls → at most one progress re-voice per interval.
3633
+ * Two sources, one throttle: completed steps (post) and a heartbeat for a SINGLE long tool call
3634
+ * (pre records the in-flight call; a self-cleaning timer narrates "still inside Bash — 70s").
3635
+ * Completion supersedes: nothing is emitted once the task has settled. */
3636
+ progressReporter(id) {
3637
+ let lastAt = Date.now();
3638
+ let steps = 0;
3639
+ let inflight = null;
3640
+ const due = () => {
3641
+ const rec = this.tasks.get(id);
3642
+ return rec && rec.status === "running" && Date.now() - lastAt >= this.options.progressIntervalMs ? rec : void 0;
3643
+ };
3644
+ const emit = (rec, line, call) => {
3645
+ lastAt = Date.now();
3646
+ this.notify("task_progress", `task ${id} (${rec.label}): ${line}`, { id, steps, call: call.name });
3647
+ this.queueRevoice(`[task ${id} progress] ${line}`);
3648
+ };
3649
+ const timer = setInterval(() => {
3650
+ const rec = this.tasks.get(id);
3651
+ if (!rec || rec.status !== "running") return clearInterval(timer);
3652
+ if (!inflight || !due()) return;
3653
+ emit(rec, `still inside ${describeCall(inflight.call)} \u2014 ${Math.round((Date.now() - inflight.at) / 1e3)}s on this step`, inflight.call);
3654
+ }, Math.max(this.options.progressIntervalMs, 250));
3655
+ timer.unref?.();
3656
+ return {
3657
+ pre: (call) => {
3658
+ inflight = { call, at: Date.now() };
3659
+ },
3660
+ post: (call) => {
3661
+ steps++;
3662
+ inflight = null;
3663
+ const rec = due();
3664
+ if (rec) emit(rec, `still running \u2014 ${steps} steps so far, now: ${describeCall(call)}`, call);
3665
+ }
3666
+ };
3667
+ }
3570
3668
  onWorkerSettled(id, res) {
3571
3669
  const rec = this.tasks.get(id);
3572
3670
  if (res.finishReason === "aborted" || rec.status === "cancelled") {
@@ -3627,6 +3725,58 @@ ${recent}` : brief;
3627
3725
  }
3628
3726
  };
3629
3727
  }
3728
+ /** Sub-100ms read-only lookups the voice may do itself — everything else stays Delegate-only.
3729
+ * fs-only (no shell; the engine is VFS-abstracted): time, git branch (.git/HEAD read), ls, file
3730
+ * head. Output is hard-capped so a lookup can never bloat the skinny voice context. */
3731
+ quickLookTool() {
3732
+ const CAP = 2e3;
3733
+ const kinds = [.../* @__PURE__ */ new Set(["time", "branch", "ls", "file", ...Object.keys(this.options.quickLook ?? {})])];
3734
+ return {
3735
+ name: "QuickLook",
3736
+ description: `Instant read-only lookup \u2014 one of: ${kinds.join(", ")}. For trivial facts only; anything needing search, commands, or reasoning goes through Delegate.`,
3737
+ parameters: {
3738
+ type: "object",
3739
+ required: ["what"],
3740
+ properties: {
3741
+ what: { type: "string", enum: kinds, description: "what to look up" },
3742
+ path: { type: "string", description: "for ls/file: the path to look at" }
3743
+ }
3744
+ },
3745
+ run: async ({ what, path }) => {
3746
+ const fs = this.options.fs;
3747
+ try {
3748
+ const over = this.options.quickLook?.[String(what)];
3749
+ if (over) return await over(path ? String(path) : void 0);
3750
+ switch (String(what)) {
3751
+ case "time":
3752
+ return (/* @__PURE__ */ new Date()).toString();
3753
+ case "branch": {
3754
+ if (!fs) return "unavailable (no filesystem)";
3755
+ const head = (await fs.readFile(".git/HEAD")).trim();
3756
+ return head.startsWith("ref: refs/heads/") ? `branch: ${head.slice("ref: refs/heads/".length)}` : `detached HEAD at ${head.slice(0, 12)}`;
3757
+ }
3758
+ case "ls": {
3759
+ if (!fs) return "unavailable (no filesystem)";
3760
+ const names = await fs.readDir(String(path ?? "."));
3761
+ return names.slice(0, 50).join("\n") + (names.length > 50 ? `
3762
+ \u2026 (+${names.length - 50} more)` : "");
3763
+ }
3764
+ case "file": {
3765
+ if (!fs) return "unavailable (no filesystem)";
3766
+ if (!path) return "file lookup needs a path";
3767
+ const text = await fs.readFile(String(path));
3768
+ return text.length > CAP ? text.slice(0, CAP) + `
3769
+ \u2026 (truncated \u2014 ${text.length} chars total; Delegate for the full file)` : text;
3770
+ }
3771
+ default:
3772
+ return `unknown lookup '${what}'`;
3773
+ }
3774
+ } catch (e) {
3775
+ return `lookup failed: ${e?.message ?? e}`;
3776
+ }
3777
+ }
3778
+ };
3779
+ }
3630
3780
  cancelTaskTool() {
3631
3781
  return {
3632
3782
  name: "CancelTask",
@@ -3645,15 +3795,26 @@ ${recent}` : brief;
3645
3795
  };
3646
3796
 
3647
3797
  // src/mcp.ts
3648
/**
 * Normalise an MCP tool-call result into `{ text, images? }`.
 *
 * - `null` / `undefined`  → `{ text: "" }`
 * - a string              → `{ text: <that string> }`
 * - an object whose `content` is an array: parts with `type: "image"`, string
 *   `data` and a truthy `mimeType` are collected into `images`; parts with a
 *   string `text` contribute that text; any other part contributes its JSON
 *   form. Text pieces are joined with newlines. `images` is present only when
 *   at least one image was found.
 * - anything else, or a content array that yields neither text nor images,
 *   falls back to `{ text: JSON.stringify(result) }`.
 *
 * @param {*} result - Raw value returned by the MCP `callTool` function.
 * @returns {{text: string, images?: Array<{mimeType: string, data: string}>}}
 */
function toResult(result) {
  if (result === null || result === void 0) return { text: "" };
  if (typeof result === "string") return { text: result };

  const parts = result.content;
  if (Array.isArray(parts)) {
    const pictures = [];
    const lines = [];
    for (const part of parts) {
      const isImage = part?.type === "image" && typeof part.data === "string" && Boolean(part.mimeType);
      if (isImage) {
        pictures.push({ mimeType: part.mimeType, data: part.data });
        continue;
      }
      lines.push(typeof part?.text === "string" ? part.text : JSON.stringify(part));
    }
    const text = lines.join("\n");
    if (pictures.length > 0) return { text, images: pictures };
    if (text) return { text };
  }
  return { text: JSON.stringify(result) };
}
3658
3819
  function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
3659
3820
  return {
@@ -3661,7 +3822,8 @@ function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
3661
3822
  description: spec.description ?? `MCP tool ${spec.name}`,
3662
3823
  parameters: spec.inputSchema ?? { type: "object", properties: {} },
3663
3824
  async run(args, _ctx) {
3664
- return toText(await callTool(spec.name, args ?? {}));
3825
+ const r = toResult(await callTool(spec.name, args ?? {}));
3826
+ return r.images?.length ? r : r.text;
3665
3827
  }
3666
3828
  };
3667
3829
  }
@@ -3703,7 +3865,8 @@ function makeMcpToolSearch(specs, callTool, options = {}) {
3703
3865
  async run({ name, args }) {
3704
3866
  const n = String(name ?? "");
3705
3867
  if (!byName.has(n)) return `Error: unknown MCP tool '${n}'. Use ToolSearch to find valid names.`;
3706
- return toText(await callTool(n, args ?? {}));
3868
+ const r = toResult(await callTool(n, args ?? {}));
3869
+ return r.images?.length ? r : r.text;
3707
3870
  }
3708
3871
  };
3709
3872
  return [searchTool, callMcpTool];
@@ -3761,11 +3924,471 @@ var RecordingLifecycle = class {
3761
3924
 
3762
3925
  // src/index.ts
3763
3926
  init_logging();
3927
+
3928
+ // src/voice/engine.ts
3929
+ init_logging();
3930
+ var log8 = forComponent("VoiceEngine");
3931
+ var now = () => performance.now();
3932
/**
 * Option bag (with defaults) for VoiceEngine. The engine spreads a fresh
 * instance underneath the caller's options, so every field below is the value
 * used when the caller leaves it out.
 */
var VoiceEngineOptions = class {
  /** Speech-to-text provider. No default — VoiceEngine's constructor throws without it. */
  stt = void 0;
  /** Text-to-speech provider. No default — VoiceEngine's constructor throws without it. */
  tts = void 0;
  /** Audio player. No default — VoiceEngine's constructor throws without it. */
  player = void 0;
  /** A final utterance arrived (endpoint) — the host dispatches it as a turn. */
  onUtterance = () => {};
  /** Live partial transcript while listening (host renders the 🎤 line). */
  onPartial = () => {};
  /** Called with the new state on every engine state change. */
  onState = () => {};
  /**
   * The user spoke/acted over playback — the host aborts the in-flight turn
   * (called AFTER audio is killed). Receives the phase: 'speaking' = cut
   * mid-speech (real interruption); 'drain' = in the final audio tail (normal
   * turn-taking — hosts shouldn't alarm).
   */
  onBargeIn = () => {};
  /** Spoken micro-ack on utterance endpoint (masks LLM TTFT); '' disables. */
  ackPhrase = "";
  /**
   * Endpoint merge window (ms): hold an endpointed utterance briefly — if
   * speech resumes (spelled letters, mid-thought pauses), the next utterance
   * MERGES instead of dispatching a truncated one ("E-L-Y." / "A."). Costs
   * this much latency per turn; 0 disables.
   */
  utteranceMergeMs = 350;
  /** Heuristic (non-AEC) energy barge-in: multiplier over the running echo baseline. */
  bargeRmsMult = 2;
  /** Heuristic (non-AEC) energy barge-in: absolute RMS floor a spike must also exceed. */
  bargeRmsFloor = 500;
};
3959
/**
 * Full-duplex voice loop: wires an STT source, a TTS sink and an audio player
 * together and tracks who is talking.
 *
 * Speaking side (host-driven): beginSpeech → speakDelta… → endSpeech, with
 * interrupt() as the barge-in path. Listening side (STT-driven): handlePartial,
 * handleUtterance and handleLevel filter the assistant's own echo, detect
 * barge-in, and hold endpointed utterances for a short merge window before
 * handing them to `options.onUtterance`.
 *
 * States reported through `options.onState`: "idle", "listening", "thinking",
 * "speaking".
 */
var VoiceEngine = class {
  options;
  state = "idle";
  stt;
  tts;
  player;
  // audible (deltas flowing OR audio draining)
  speaking = false;
  // the current TTS context still accepts deltas (false once end-frame sent)
  ctxOpen = false;
  // barge-in latch: drop in-flight deltas until the next legitimate turn
  interrupted = false;
  // a TTS context is open for the current spoken turn
  spokeDeltas = false;
  drainTimer = null;
  // heuristic tier state (inert under AEC) — frozen as validated in the experiment
  echoWords = /* @__PURE__ */ new Set();
  prevReply = "";
  reply = "";
  echoUntil = 0;
  baseline = 0;
  hot = 0;
  suspectUntil = 0;
  // when the micro-ack was spoken — its echo can leak before the AEC filter converges
  ackAt = 0;
  // endpointed text held for the merge window
  pendingUtt = "";
  pendingTimer = null;
  lastInterrupted = null;
  // resolvers parked by awaitIdle() until the engine leaves speaking/thinking
  idleWaiters = [];

  /**
   * @param {object} options - Overrides merged over VoiceEngineOptions defaults.
   * @throws {Error} when `stt`, `tts` or `player` is missing.
   */
  constructor(options) {
    this.options = { ...new VoiceEngineOptions(), ...options };
    const o = this.options;
    if (!o.stt || !o.tts || !o.player) throw new Error("VoiceEngine needs stt, tts and player (see cli/voice.ts VoiceIO for platform defaults)");
    this.stt = o.stt;
    this.tts = o.tts;
    this.player = o.player;
  }

  /** Hook up the STT/TTS callbacks, connect both in parallel, then enter "listening". */
  async start() {
    this.tts.onAudio = (c) => {
      // Audio that arrives after a barge-in or settle is dropped.
      if (this.speaking) this.player.write(c);
    };
    this.stt.onPartial = (text) => this.handlePartial(text);
    this.stt.onUtterance = (text) => this.handleUtterance(text);
    this.stt.onLevel = (rms) => this.handleLevel(rms);
    await Promise.all([this.tts.connect(), this.stt.start()]);
    this.setState("listening");
    log8.info(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
  }

  /** Whether the STT source reports echo-cancelled capture. */
  get usingAec() {
    return this.stt.usingAec;
  }

  /** Move to state `s` (no-op if unchanged), notify the host, and release awaitIdle() waiters. */
  setState(s) {
    if (this.state === s) return;
    this.state = s;
    this.options.onState(s);
    if (s !== "speaking" && s !== "thinking") {
      for (const r of this.idleWaiters.splice(0)) r();
    }
  }

  /** Resolve when the engine is neither speaking nor thinking (immediate if already so). */
  awaitIdle() {
    if (this.state !== "speaking" && this.state !== "thinking") return Promise.resolve();
    return new Promise((r) => this.idleWaiters.push(r));
  }

  // --- speaking side (host-driven) ---
  /** open a spoken turn (idempotent — safe from both onUtterance and first-delta paths).
   * `ack` speaks the configured micro-ack as the context opener (utterance path only —
   * masks LLM TTFT; re-voice turns begun by their first delta skip it). */
  beginSpeech(ack = false) {
    if (this.speaking && this.ctxOpen) return;
    if (this.drainTimer) {
      clearTimeout(this.drainTimer);
      this.drainTimer = null;
    }
    this.interrupted = false;
    if (!this.speaking) this.player.markTurn();
    this.speaking = true;
    this.ctxOpen = true;
    this.spokeDeltas = false;
    this.reply = "";
    this.echoWords = new Set(this.words(this.prevReply));
    this.tts.newContext();
    if (ack && this.options.ackPhrase) {
      this.tts.speak(this.options.ackPhrase + " ", true);
      this.spokeDeltas = true;
      this.ackAt = now();
    }
    this.setState("thinking");
  }

  /** Speak one streamed text delta; opens a turn first if none is open. Dropped while the barge-in latch is set. */
  speakDelta(text) {
    if (this.interrupted) return;
    if (!this.speaking || !this.ctxOpen) this.beginSpeech();
    this.reply += text;
    // Re-tokenise the whole reply so a word split across two deltas is still registered.
    for (const w of this.words(this.reply)) this.echoWords.add(w);
    this.tts.speak(text, true);
    this.spokeDeltas = true;
    this.setState("speaking");
  }

  /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
  endSpeech() {
    this.interrupted = false;
    if (!this.speaking) return;
    this.ctxOpen = false;
    if (this.reply) this.prevReply = this.reply;
    const settle = () => {
      if (this.ctxOpen) {
        // A new turn opened while we were draining — leave it alone.
        this.drainTimer = null;
        return;
      }
      this.drainTimer = null;
      this.speaking = false;
      this.echoUntil = now() + 2500;
      if (!this.usingAec) this.stt.reset();
      this.setState("listening");
    };
    const drainThenSettle = () => {
      if (this.drainTimer) clearTimeout(this.drainTimer);
      this.drainTimer = setTimeout(settle, this.player.drainMs() + 300);
    };
    if (this.spokeDeltas) {
      this.tts.onDone = drainThenSettle;
      this.tts.end();
      if (this.drainTimer) clearTimeout(this.drainTimer);
      // Watchdog: settle anyway if the TTS never reports completion.
      this.drainTimer = setTimeout(drainThenSettle, 15e3);
    } else drainThenSettle();
  }

  /** text of the reply cut by the last barge-in — consumed by the host to tell the model what
   * the user did NOT hear. Cleared on read. */
  takeInterruptedReply() {
    const r = this.lastInterrupted;
    this.lastInterrupted = null;
    return r;
  }

  /** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
  interrupt() {
    if (!this.speaking && !this.drainTimer) return;
    if (this.drainTimer) {
      clearTimeout(this.drainTimer);
      this.drainTimer = null;
    }
    // Rough estimate of how much text was audible: 15 characters per played second.
    const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
    if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
    this.speaking = false;
    this.ctxOpen = false;
    this.interrupted = true;
    this.suspectUntil = 0;
    this.echoUntil = now() + 2500;
    this.tts.cancel();
    this.player.kill();
    if (!this.usingAec) this.stt.reset();
    if (this.reply) this.prevReply = this.reply;
    this.setState("listening");
  }

  /**
   * Shut everything down and go "idle". Also drops timer handles, held text,
   * the audible flags and the TTS completion hook, so a late TTS callback can
   * neither re-arm a drain timer nor flip the stopped engine back to
   * "listening", and late audio is not written to the killed player.
   */
  stop() {
    if (this.pendingTimer) clearTimeout(this.pendingTimer);
    if (this.drainTimer) clearTimeout(this.drainTimer);
    this.pendingTimer = null;
    this.drainTimer = null;
    this.pendingUtt = "";
    this.speaking = false;
    this.ctxOpen = false;
    this.tts.onDone = () => {
    };
    this.stt.stop();
    this.player.kill();
    this.tts.close();
    this.setState("idle");
  }

  // --- listening side (STT-driven) ---
  /** Lower-case, strip everything but a-z/0-9/whitespace, split, keep tokens of 2+ chars. */
  words(s) {
    return s.toLowerCase().replace(/[^a-z0-9\s]/g, "").split(/\s+/).filter((w) => w.length >= 2);
  }

  /** Words of `text` that are not in the current echo vocabulary. */
  novelWords(text) {
    return this.words(text).filter((w) => !this.echoWords.has(w));
  }

  /** True while speaking or inside the echo tail window that follows speech. */
  echoActive() {
    return this.speaking || now() < this.echoUntil;
  }

  /** Live partial from STT: barge-in detection while speaking, merge-window hold and host echo otherwise. */
  handlePartial(text) {
    if (this.speaking) {
      const barge = this.novelWords(text).length >= (this.usingAec ? 1 : this.suspectUntil ? 1 : 2);
      if (barge) {
        const phase = this.ctxOpen ? "speaking" : "drain";
        this.interrupt();
        this.options.onBargeIn(phase);
      }
      return;
    }
    if (this.pendingUtt && text.trim()) {
      // Speech resumed inside the merge window: suspend the flush so the next endpoint can merge.
      if (this.pendingTimer) {
        clearTimeout(this.pendingTimer);
        this.pendingTimer = null;
      }
    }
    if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
  }

  /**
   * Endpointed utterance from STT. Echo of the assistant's own speech and echo
   * of the micro-ack are dropped; anything else is appended to the held text
   * and flushed after the merge window (immediately when the window is 0).
   *
   * When an utterance is dropped while earlier text is still held with its
   * flush suspended by handlePartial, the flush is re-armed here — otherwise
   * the held text would sit undelivered until some later utterance arrived.
   */
  handleUtterance(text) {
    if (this.echoActive() && this.novelWords(text).length < (this.usingAec ? 1 : 2)) {
      this.stt.reset();
      this.rearmPendingFlush();
      return;
    }
    const squash = (t) => t.toLowerCase().replace(/[^a-z]/g, "").replace(/(.)\1+/g, "$1");
    if (this.ackAt && now() - this.ackAt < 6e3 && squash(text) === squash(this.options.ackPhrase)) {
      this.ackAt = 0;
      this.rearmPendingFlush();
      return;
    }
    this.pendingUtt = this.pendingUtt ? `${this.pendingUtt} ${text}` : text;
    if (this.pendingTimer) clearTimeout(this.pendingTimer);
    if (!this.options.utteranceMergeMs) return this.flushUtterance();
    this.pendingTimer = setTimeout(() => this.flushUtterance(), this.options.utteranceMergeMs);
  }

  /** Restart the merge-window flush when text is held but no flush is scheduled. */
  rearmPendingFlush() {
    if (!this.pendingUtt || this.pendingTimer) return;
    this.pendingTimer = setTimeout(() => this.flushUtterance(), this.options.utteranceMergeMs);
  }

  /** Deliver the held utterance (if any) to the host and clear the hold. */
  flushUtterance() {
    if (this.pendingTimer) {
      clearTimeout(this.pendingTimer);
      this.pendingTimer = null;
    }
    const text = this.pendingUtt;
    this.pendingUtt = "";
    if (text) this.options.onUtterance(text);
  }

  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
  handleLevel(rms) {
    if (this.usingAec) return;
    if (!this.speaking) {
      this.baseline = 0;
      this.hot = 0;
      return;
    }
    if (!this.baseline) {
      this.baseline = rms;
      return;
    }
    // Exponential moving average of the level heard while we are speaking.
    this.baseline = this.baseline * 0.9 + rms * 0.1;
    if (rms > Math.max(this.baseline * this.options.bargeRmsMult, this.options.bargeRmsFloor)) this.hot++;
    else this.hot = 0;
    if (this.hot >= 2 && !this.suspectUntil) {
      // While suspect, handlePartial accepts a single novel word as a barge-in.
      this.suspectUntil = now() + 1300;
      setTimeout(() => {
        this.suspectUntil = 0;
      }, 1350);
    }
  }
};
4196
+
4197
+ // src/voice/soniox.ts
4198
+ init_logging();
4199
+
4200
+ // src/voice/types.ts
4201
+ var STT_SAMPLE_RATE = 16e3;
4202
+ var TTS_SAMPLE_RATE = 44100;
4203
/**
 * Resolve a credential that is given either directly or as a provider function.
 *
 * @param {*} auth - The credential itself, or a (possibly async) function returning it.
 * @returns {Promise<*>} The credential; a provider function is called once and awaited.
 */
async function resolveAuth(auth) {
  if (typeof auth !== "function") return auth;
  const credential = await auth();
  return credential;
}
4206
+
4207
+ // src/voice/soniox.ts
4208
+ var log9 = forComponent("SonioxSTT");
4209
+ var now2 = () => performance.now();
4210
/** Option bag (with defaults) for SonioxSTT; a fresh instance is spread under the caller's options. */
var SonioxSTTOptions = class {
  /** Credential, or a function producing it (resolved when the websocket connects). */
  auth = "";
  /** Audio source; SonioxSTT calls its `start(cb)` / `stop()` and reads its `aec` flag. No default. */
  source = void 0;
  /** Model name sent in the websocket configuration message. */
  model = "stt-rt-preview";
  /** Language hints sent in the websocket configuration message. */
  languageHints = ["en"];
};
4216
/**
 * Streaming speech-to-text over the Soniox real-time websocket.
 *
 * Audio chunks from `options.source` are forwarded to the socket; transcript
 * tokens coming back are folded into a running final + partial text and
 * surfaced through the `onPartial` / `onUtterance` callbacks. A per-chunk RMS
 * level is reported through `onLevel`. The socket reconnects once whenever it
 * closes without `stop()` having been called.
 */
var SonioxSTT = class {
  options;
  ws;
  stopped = false;
  sourceStarted = false;
  /** live transcript (final + not-yet-final text) after every server message */
  onPartial = () => {
  };
  /** trimmed final text when the server signals an endpoint */
  onUtterance = () => {
  };
  /** mic energy (RMS) per chunk — drives the energy-based heuristic barge-in tier */
  onLevel = () => {
  };
  finalText = "";
  partialText = "";

  /** @param {object} options - Overrides merged over SonioxSTTOptions defaults. */
  constructor(options) {
    this.options = { ...new SonioxSTTOptions(), ...options };
  }

  /** Whether the audio source reports echo-cancelled capture (false without a source). */
  get usingAec() {
    return this.options.source?.aec ?? false;
  }

  /**
   * Open the websocket, send the configuration message and install the
   * message/close handlers.
   *
   * If `stop()` is called while this connect is still in flight (for example
   * during an auto-reconnect that is awaiting the credential), the connect is
   * abandoned so no socket is left open with nothing to close it. An explicit
   * connect on an instance that was already stopped beforehand proceeds.
   *
   * @throws {Error} when the socket reports an error before opening.
   */
  async connectWs() {
    const stoppedAtEntry = this.stopped;
    const stoppedMeanwhile = () => this.stopped && !stoppedAtEntry;
    const apiKey = await resolveAuth(this.options.auth);
    if (stoppedMeanwhile()) return;
    const ws = new WebSocket("wss://stt-rt.soniox.com/transcribe-websocket");
    this.ws = ws;
    await new Promise((res, rej) => {
      ws.onopen = () => res();
      ws.onerror = (e) => rej(new Error(`soniox ws: ${e.message || "connect failed"}`));
    });
    if (stoppedMeanwhile()) {
      ws.close();
      return;
    }
    ws.send(
      JSON.stringify({
        api_key: apiKey,
        model: this.options.model,
        audio_format: "pcm_s16le",
        sample_rate: STT_SAMPLE_RATE,
        num_channels: 1,
        language_hints: this.options.languageHints,
        enable_endpoint_detection: true
      })
    );
    ws.onmessage = (ev) => {
      // A malformed frame must not throw out of the socket event handler.
      let msg;
      try {
        msg = JSON.parse(String(ev.data));
      } catch (e) {
        log9.error(`soniox: unparseable message: ${e.message}`);
        return;
      }
      if (msg === null || typeof msg !== "object") return;
      this.handle(msg);
    };
    ws.onclose = (ev) => {
      if (this.stopped) return;
      log9.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
      this.reset();
      this.connectWs().catch((e) => log9.error(`soniox reconnect failed: ${e.message}`));
    };
  }

  /** Connect, then start the audio source (only the first time) and stream its chunks. */
  async start() {
    await this.connectWs();
    if (this.sourceStarted) return;
    this.sourceStarted = true;
    await this.options.source.start((chunk) => {
      // RMS over whole little-endian 16-bit samples; an empty chunk reports 0, not NaN.
      const samples = chunk.byteLength >> 1;
      const view = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
      let sum = 0;
      for (let i = 0; i < samples; i++) {
        const v = view.getInt16(i * 2, true);
        sum += v * v;
      }
      this.onLevel(samples ? Math.sqrt(sum / samples) : 0);
      if (this.ws.readyState === WebSocket.OPEN) this.ws.send(chunk);
    });
  }

  /**
   * Fold one server message into the transcript state.
   * Final tokens accumulate in `finalText`; non-final tokens of this message
   * form `partialText`; an `<end>` token marks an endpoint, which emits the
   * trimmed final text as an utterance (when non-blank) and resets the state.
   */
  handle(m) {
    if (m.error_message) return log9.error(`soniox: ${m.error_message}`);
    let endpoint = false;
    for (const t of m.tokens ?? []) {
      if (t.text === "<end>") endpoint = true;
      else if (t.is_final) this.finalText += t.text;
    }
    this.partialText = (m.tokens ?? []).filter((t) => !t.is_final && t.text !== "<end>").map((t) => t.text).join("");
    this.onPartial(this.finalText + this.partialText);
    if (endpoint && this.finalText.trim()) {
      const utterance = this.finalText.trim();
      this.reset();
      this.onUtterance(utterance, now2());
    }
  }

  /** Discard the accumulated final and partial text. */
  reset() {
    this.finalText = "";
    this.partialText = "";
  }

  /** Stop the source and close the socket without triggering a reconnect. */
  stop() {
    this.stopped = true;
    this.options.source?.stop();
    if (this.ws) this.ws.onclose = null;
    this.ws?.close();
  }
};
4303
+
4304
+ // src/voice/cartesia.ts
4305
+ init_logging(); // NOTE(review): presumably the bundler's lazy initializer for the logging module, run before forComponent() is used below — confirm against its definition
4306
+ var log10 = forComponent("CartesiaTTS"); // component logger; CartesiaTTS calls log10.warn on socket close and on server error messages
4307
+ var now3 = () => performance.now(); // timestamp helper; used to record CartesiaTTS.firstAudioAt on the first audio chunk of a context
4308
/**
 * Default option values for CartesiaTTS. A fresh instance is spread underneath
 * the caller-supplied options, so every key defined here acts as a fallback.
 */
var CartesiaTTSOptions = class {
  constructor() {
    // Credential, or a function producing one (resolved through resolveAuth).
    this.auth = "";
    // Sent as voice.id in each request frame.
    this.voiceId = "";
    // Sent as model_id in each request frame.
    this.model = "sonic-3.5";
    // 'apiKey' (server/CLI) → `api_key=` URL param; 'token' (browser, BE-minted) → `access_token=`.
    this.authMode = "apiKey";
  }
};
4315
/**
 * Streaming text-to-speech client for the Cartesia WebSocket endpoint.
 *
 * Text is sent in frames tagged with the current context id; audio chunks for
 * that context come back through `onAudio`, and `onDone` fires when the server
 * reports the context finished. Messages for any other context are ignored.
 */
var CartesiaTTS = class {
  /** Effective options: CartesiaTTSOptions defaults overlaid with the caller's values. */
  options;
  /** WebSocket created by connect(). */
  ws;
  /** Counter used to mint context ids ("ctx-1", "ctx-2", …). */
  ctxSeq = 0;
  /** Id of the context currently being spoken; "" until newContext() is called. */
  ctxId = "";
  /** Called with the decoded bytes of each audio chunk. */
  onAudio = () => {
  };
  /** Called when the server sends a "done" message for the current context. */
  onDone = () => {
  };
  /** now3() timestamp of the first audio chunk in the current context; 0 until one arrives. */
  firstAudioAt = 0;

  /**
   * @param options Partial options; missing keys fall back to CartesiaTTSOptions defaults.
   */
  constructor(options) {
    this.options = { ...new CartesiaTTSOptions(), ...options };
  }

  /**
   * Open the socket and install the close and message handlers.
   * Rejects if the socket errors before it opens.
   */
  async connect() {
    const key = await resolveAuth(this.options.auth);
    const param = this.options.authMode === "token" ? "access_token" : "api_key";
    // Build the query through URLSearchParams so a credential containing
    // reserved characters (+ / = &) is percent-encoded rather than corrupting
    // the query string or injecting extra parameters.
    const endpoint = new URL("wss://api.cartesia.ai/tts/websocket");
    endpoint.searchParams.set("cartesia_version", "2026-03-01");
    endpoint.searchParams.set(param, key);
    this.ws = new WebSocket(endpoint.toString());
    await new Promise((res, rej) => {
      this.ws.onopen = () => res();
      this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
    });
    this.ws.onclose = (ev) => log10.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
    this.ws.onmessage = (ev) => {
      // A malformed or non-text frame must not throw out of the event handler.
      let m;
      try {
        m = JSON.parse(String(ev.data));
      } catch (err) {
        log10.warn(`cartesia: ignoring unparseable message (${err.message})`);
        return;
      }
      if (m === null || typeof m !== "object") return;
      // Ignore output that belongs to a context other than the current one.
      if (m.context_id && m.context_id !== this.ctxId) return;
      if (m.type === "chunk" && m.data) {
        if (!this.firstAudioAt) this.firstAudioAt = now3();
        this.onAudio(base64ToBytes(m.data));
      } else if (m.type === "done") this.onDone();
      // These two error texts are filtered out rather than logged.
      else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log10.warn(`cartesia: ${JSON.stringify(m)}`);
    };
  }

  /**
   * Begin a new context and reset the first-audio timestamp.
   * @returns The new context id.
   */
  newContext() {
    this.ctxId = `ctx-${++this.ctxSeq}`;
    this.firstAudioAt = 0;
    return this.ctxId;
  }

  /**
   * Serialize one request frame for the current context.
   * @param transcript Text to synthesize.
   * @param cont Value for the frame's `continue` flag.
   * @returns JSON string ready to send.
   */
  frame(transcript, cont) {
    return JSON.stringify({
      model_id: this.options.model,
      transcript,
      voice: { mode: "id", id: this.options.voiceId },
      output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: TTS_SAMPLE_RATE },
      context_id: this.ctxId,
      continue: cont
    });
  }

  /** Send `text` for the current context; silently dropped if the socket is not open. */
  speak(text, cont) {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
  }

  /** Send an empty transcript with `continue: false` for the current context. */
  end() {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
  }

  /** Ask the server to cancel the current context. */
  cancel() {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
  }

  /** Close the socket without logging the close event. */
  close() {
    if (this.ws) this.ws.onclose = null;
    this.ws?.close();
  }
};
4376
/**
 * Decode a base64 string into bytes.
 *
 * @param b64 Base64-encoded text.
 * @returns A Buffer when the global `Buffer` exists, otherwise a Uint8Array
 *          built from the `atob` result.
 */
function base64ToBytes(b64) {
  const hasNodeBuffer = typeof Buffer !== "undefined";
  if (hasNodeBuffer) {
    return Buffer.from(b64, "base64");
  }
  // atob yields one character per byte, so each char code is the byte value.
  const binary = atob(b64);
  return Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
}
4383
+
4384
+ // src/index.ts
3764
4385
  import { MemFilesystem as MemFilesystem3, IndexedDbFilesystem, CommandExecutor as CommandExecutor2, registerHeadlessCommands as registerHeadlessCommands2 } from "@livx.cc/wcli/core";
3765
4386
  export {
3766
4387
  Agent,
3767
4388
  AgentOptions,
3768
4389
  BodDbFilesystem,
4390
+ CartesiaTTS,
4391
+ CartesiaTTSOptions,
3769
4392
  CommandExecutor2 as CommandExecutor,
3770
4393
  ConsoleHostBridge,
3771
4394
  DEFAULT_DENY,
@@ -3785,9 +4408,15 @@ export {
3785
4408
  PermissionPolicy,
3786
4409
  RecordingHooks,
3787
4410
  RecordingLifecycle,
4411
+ STT_SAMPLE_RATE,
3788
4412
  SandboxJobRegistry,
3789
4413
  ScriptedHostBridge,
4414
+ SonioxSTT,
4415
+ SonioxSTTOptions,
4416
+ TTS_SAMPLE_RATE,
3790
4417
  VOICE_SYSTEM_PROMPT,
4418
+ VoiceEngine,
4419
+ VoiceEngineOptions,
3791
4420
  applyEditsTool,
3792
4421
  askUserQuestionTool,
3793
4422
  bashTool,
@@ -3799,6 +4428,7 @@ export {
3799
4428
  defaultTools,
3800
4429
  diskAgentOptions,
3801
4430
  editTool,
4431
+ exitSessionTool,
3802
4432
  expandCommand,
3803
4433
  expandTemplate,
3804
4434
  forComponent,
@@ -3835,6 +4465,7 @@ export {
3835
4465
  relevanceScore,
3836
4466
  repoIndex,
3837
4467
  repoMapTool,
4468
+ resolveAuth,
3838
4469
  rollbackTool,
3839
4470
  sandboxAgentOptions,
3840
4471
  slugify,