agent.libx.js 0.89.9 → 0.92.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -851,6 +851,17 @@ ${out}`.trim() : out || "(command succeeded, no output)";
851
851
  );
852
852
  return `Started background job ${id} \u2014 poll with JobOutput({id:"${id}"}) / JobStatus, stop with JobKill.`;
853
853
  }
854
+ function exitSessionTool(onExit) {
855
+ return {
856
+ name: "ExitSession",
857
+ description: `End the current session and exit the CLI. Call this when the user says goodbye, asks to quit, or clearly indicates they want to stop the conversation (e.g. "ok bye", "that's all", "exit", "goodnight").`,
858
+ parameters: { type: "object", properties: {} },
859
+ async run() {
860
+ onExit();
861
+ return "Session ending. Goodbye!";
862
+ }
863
+ };
864
+ }
854
865
  function defaultTools() {
855
866
  return [bashTool, readTool, editTool];
856
867
  }
@@ -2835,7 +2846,15 @@ var Agent = class _Agent {
2835
2846
  toolCallsTotal += toolCalls.length;
2836
2847
  if (o.maxToolCalls && toolCallsTotal > o.maxToolCalls) return kill("max_tool_calls");
2837
2848
  for (const tc of toolCalls) {
2838
- const content = await this.dispatch(tc);
2849
+ const raw = await this.dispatch(tc);
2850
+ let content;
2851
+ if (typeof raw === "string") {
2852
+ content = raw;
2853
+ } else {
2854
+ const parts = [{ type: "text", text: raw.text }];
2855
+ for (const img of raw.images ?? []) parts.push(imagePart(`data:${img.mimeType};base64,${img.data}`));
2856
+ content = parts;
2857
+ }
2839
2858
  this.transcript.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content });
2840
2859
  }
2841
2860
  }
@@ -2892,10 +2911,17 @@ var Agent = class _Agent {
2892
2911
  return earlyError;
2893
2912
  }
2894
2913
  let result;
2914
+ let images;
2895
2915
  let threw = false;
2896
2916
  try {
2897
2917
  log3.debug(`${tc.function.name}(${tc.function.arguments})`);
2898
- result = await tool.run(args, this.ctx);
2918
+ const raw = await tool.run(args, this.ctx);
2919
+ if (typeof raw === "string") {
2920
+ result = raw;
2921
+ } else {
2922
+ result = raw.text;
2923
+ images = raw.images;
2924
+ }
2899
2925
  } catch (e) {
2900
2926
  const msg = e instanceof Error ? e.message : String(e);
2901
2927
  log3.debug(`${tc.function.name} -> error: ${msg}`);
@@ -2905,7 +2931,12 @@ var Agent = class _Agent {
2905
2931
  if (!threw) result = await this.maybeAutoTest(tc.function.name, result);
2906
2932
  await hooks?.postToolUse?.(call, result, meta);
2907
2933
  this.options.host?.notify?.({ kind: "tool_result", id: tc.id ?? "", output: result, isError: threw });
2908
- return result;
2934
+ if (images?.length) {
2935
+ for (const img of images) {
2936
+ this.options.host?.notify?.({ kind: "tool_result_image", id: tc.id ?? "", dataUrl: `data:${img.mimeType};base64,${img.data}` });
2937
+ }
2938
+ }
2939
+ return images?.length ? { text: result, images } : result;
2909
2940
  }
2910
2941
  static WRITE_CLASS = ["Write", "Edit", "MultiEdit", "ApplyEdits"];
2911
2942
  /** Append an autoTest failure section to a write-class tool result, if configured. */
@@ -3472,8 +3503,11 @@ var DuplexAgentOptions = class {
3472
3503
  /** Awaited BEFORE a delegated worker spawns — open a per-task checkpoint frame, audit, etc.
3473
3504
  * (post-spawn would race the worker's first edits). */
3474
3505
  onTaskStart;
3506
+ /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
3507
+ * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
3508
+ quickLook;
3475
3509
  };
3476
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. Never read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.';
3510
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. Never read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3477
3511
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
3478
3512
  var DuplexAgent = class {
3479
3513
  options;
@@ -3493,7 +3527,10 @@ var DuplexAgent = class {
3493
3527
  model: o.voiceModel,
3494
3528
  stream: true,
3495
3529
  host: o.host,
3496
- systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : ""),
3530
+ // Runtime context line: without it the voice confidently invents "facts" like today's date
3531
+ // (its training cutoff) instead of delegating or admitting it doesn't know.
3532
+ systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
3533
+ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
3497
3534
  instructionFiles: false,
3498
3535
  maxSteps: 8,
3499
3536
  // a voice turn should never loop
@@ -3502,7 +3539,7 @@ var DuplexAgent = class {
3502
3539
  // no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
3503
3540
  // voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
3504
3541
  // tools come in via voiceOptions.tools and are merged here.
3505
- tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool()]
3542
+ tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
3506
3543
  });
3507
3544
  }
3508
3545
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
@@ -3627,6 +3664,58 @@ ${recent}` : brief;
3627
3664
  }
3628
3665
  };
3629
3666
  }
3667
+ /** Sub-100ms read-only lookups the voice may do itself — everything else stays Delegate-only.
3668
+ * fs-only (no shell; the engine is VFS-abstracted): time, git branch (.git/HEAD read), ls, file
3669
+ * head. Output is hard-capped so a lookup can never bloat the skinny voice context. */
3670
+ quickLookTool() {
3671
+ const CAP = 2e3;
3672
+ const kinds = [.../* @__PURE__ */ new Set(["time", "branch", "ls", "file", ...Object.keys(this.options.quickLook ?? {})])];
3673
+ return {
3674
+ name: "QuickLook",
3675
+ description: `Instant read-only lookup \u2014 one of: ${kinds.join(", ")}. For trivial facts only; anything needing search, commands, or reasoning goes through Delegate.`,
3676
+ parameters: {
3677
+ type: "object",
3678
+ required: ["what"],
3679
+ properties: {
3680
+ what: { type: "string", enum: kinds, description: "what to look up" },
3681
+ path: { type: "string", description: "for ls/file: the path to look at" }
3682
+ }
3683
+ },
3684
+ run: async ({ what, path }) => {
3685
+ const fs = this.options.fs;
3686
+ try {
3687
+ const over = this.options.quickLook?.[String(what)];
3688
+ if (over) return await over(path ? String(path) : void 0);
3689
+ switch (String(what)) {
3690
+ case "time":
3691
+ return (/* @__PURE__ */ new Date()).toString();
3692
+ case "branch": {
3693
+ if (!fs) return "unavailable (no filesystem)";
3694
+ const head = (await fs.readFile(".git/HEAD")).trim();
3695
+ return head.startsWith("ref: refs/heads/") ? `branch: ${head.slice("ref: refs/heads/".length)}` : `detached HEAD at ${head.slice(0, 12)}`;
3696
+ }
3697
+ case "ls": {
3698
+ if (!fs) return "unavailable (no filesystem)";
3699
+ const names = await fs.readDir(String(path ?? "."));
3700
+ return names.slice(0, 50).join("\n") + (names.length > 50 ? `
3701
+ \u2026 (+${names.length - 50} more)` : "");
3702
+ }
3703
+ case "file": {
3704
+ if (!fs) return "unavailable (no filesystem)";
3705
+ if (!path) return "file lookup needs a path";
3706
+ const text = await fs.readFile(String(path));
3707
+ return text.length > CAP ? text.slice(0, CAP) + `
3708
+ \u2026 (truncated \u2014 ${text.length} chars total; Delegate for the full file)` : text;
3709
+ }
3710
+ default:
3711
+ return `unknown lookup '${what}'`;
3712
+ }
3713
+ } catch (e) {
3714
+ return `lookup failed: ${e?.message ?? e}`;
3715
+ }
3716
+ }
3717
+ };
3718
+ }
3630
3719
  cancelTaskTool() {
3631
3720
  return {
3632
3721
  name: "CancelTask",
@@ -3645,15 +3734,26 @@ ${recent}` : brief;
3645
3734
  };
3646
3735
 
3647
3736
  // src/mcp.ts
3648
- function toText(result) {
3649
- if (result == null) return "";
3650
- if (typeof result === "string") return result;
3737
+ function toResult(result) {
3738
+ if (result == null) return { text: "" };
3739
+ if (typeof result === "string") return { text: result };
3651
3740
  const content = result.content;
3652
3741
  if (Array.isArray(content)) {
3653
- const text = content.map((c) => typeof c?.text === "string" ? c.text : JSON.stringify(c)).join("\n");
3654
- if (text) return text;
3742
+ const texts = [];
3743
+ const images = [];
3744
+ for (const c of content) {
3745
+ if (c?.type === "image" && typeof c.data === "string" && c.mimeType) {
3746
+ images.push({ mimeType: c.mimeType, data: c.data });
3747
+ } else if (typeof c?.text === "string") {
3748
+ texts.push(c.text);
3749
+ } else {
3750
+ texts.push(JSON.stringify(c));
3751
+ }
3752
+ }
3753
+ const text = texts.join("\n");
3754
+ if (text || images.length) return { text, ...images.length ? { images } : {} };
3655
3755
  }
3656
- return JSON.stringify(result);
3756
+ return { text: JSON.stringify(result) };
3657
3757
  }
3658
3758
  function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
3659
3759
  return {
@@ -3661,7 +3761,8 @@ function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
3661
3761
  description: spec.description ?? `MCP tool ${spec.name}`,
3662
3762
  parameters: spec.inputSchema ?? { type: "object", properties: {} },
3663
3763
  async run(args, _ctx) {
3664
- return toText(await callTool(spec.name, args ?? {}));
3764
+ const r = toResult(await callTool(spec.name, args ?? {}));
3765
+ return r.images?.length ? r : r.text;
3665
3766
  }
3666
3767
  };
3667
3768
  }
@@ -3703,7 +3804,8 @@ function makeMcpToolSearch(specs, callTool, options = {}) {
3703
3804
  async run({ name, args }) {
3704
3805
  const n = String(name ?? "");
3705
3806
  if (!byName.has(n)) return `Error: unknown MCP tool '${n}'. Use ToolSearch to find valid names.`;
3706
- return toText(await callTool(n, args ?? {}));
3807
+ const r = toResult(await callTool(n, args ?? {}));
3808
+ return r.images?.length ? r : r.text;
3707
3809
  }
3708
3810
  };
3709
3811
  return [searchTool, callMcpTool];
@@ -3761,11 +3863,471 @@ var RecordingLifecycle = class {
3761
3863
 
3762
3864
  // src/index.ts
3763
3865
  init_logging();
3866
+
3867
+ // src/voice/engine.ts
3868
+ init_logging();
3869
+ var log8 = forComponent("VoiceEngine");
3870
+ var now = () => performance.now();
3871
+ var VoiceEngineOptions = class {
3872
+ stt;
3873
+ tts;
3874
+ player;
3875
+ /** a final utterance arrived (endpoint) — host dispatches it as a turn */
3876
+ onUtterance = () => {
3877
+ };
3878
+ /** live partial transcript while listening (host renders the 🎤 line) */
3879
+ onPartial = () => {
3880
+ };
3881
+ onState = () => {
3882
+ };
3883
+ /** user spoke/acted over playback — host aborts the in-flight turn (called AFTER audio is killed).
3884
+ * phase: 'speaking' = cut mid-speech (real interruption); 'drain' = in the final audio tail
3885
+ * (normal turn-taking — hosts shouldn't alarm). */
3886
+ onBargeIn = () => {
3887
+ };
3888
+ /** spoken micro-ack on utterance endpoint (masks LLM TTFT); '' disables */
3889
+ ackPhrase = "";
3890
+ /** Endpoint merge window (ms): hold an endpointed utterance briefly — if speech resumes (spelled
3891
+ * letters, mid-thought pauses), the next utterance MERGES instead of dispatching a truncated one
3892
+ * ("E-L-Y." / "A."). Costs this much latency per turn; 0 disables. */
3893
+ utteranceMergeMs = 350;
3894
+ /** heuristic (non-AEC) energy barge-in tuning */
3895
+ bargeRmsMult = 2;
3896
+ bargeRmsFloor = 500;
3897
+ };
3898
+ var VoiceEngine = class {
3899
+ options;
3900
+ state = "idle";
3901
+ stt;
3902
+ tts;
3903
+ player;
3904
+ speaking = false;
3905
+ // audible (deltas flowing OR audio draining)
3906
+ ctxOpen = false;
3907
+ // the current TTS context still accepts deltas (false once end-frame sent)
3908
+ interrupted = false;
3909
+ // barge-in latch: drop in-flight deltas until the next legitimate turn
3910
+ spokeDeltas = false;
3911
+ // a TTS context is open for the current spoken turn
3912
+ drainTimer = null;
3913
+ // heuristic tier state (inert under AEC) — frozen as validated in the experiment
3914
+ echoWords = /* @__PURE__ */ new Set();
3915
+ prevReply = "";
3916
+ reply = "";
3917
+ echoUntil = 0;
3918
+ baseline = 0;
3919
+ hot = 0;
3920
+ suspectUntil = 0;
3921
+ ackAt = 0;
3922
+ // when the micro-ack was spoken — its echo can leak before the AEC filter converges
3923
+ pendingUtt = "";
3924
+ // endpointed text held for the merge window
3925
+ pendingTimer = null;
3926
+ lastInterrupted = null;
3927
+ constructor(options) {
3928
+ this.options = { ...new VoiceEngineOptions(), ...options };
3929
+ const o = this.options;
3930
+ if (!o.stt || !o.tts || !o.player) throw new Error("VoiceEngine needs stt, tts and player (see cli/voice.ts VoiceIO for platform defaults)");
3931
+ this.stt = o.stt;
3932
+ this.tts = o.tts;
3933
+ this.player = o.player;
3934
+ }
3935
+ async start() {
3936
+ this.tts.onAudio = (c) => {
3937
+ if (this.speaking) this.player.write(c);
3938
+ };
3939
+ this.stt.onPartial = (text) => this.handlePartial(text);
3940
+ this.stt.onUtterance = (text) => this.handleUtterance(text);
3941
+ this.stt.onLevel = (rms) => this.handleLevel(rms);
3942
+ await Promise.all([this.tts.connect(), this.stt.start()]);
3943
+ this.setState("listening");
3944
+ log8.info(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
3945
+ }
3946
+ get usingAec() {
3947
+ return this.stt.usingAec;
3948
+ }
3949
+ idleWaiters = [];
3950
+ setState(s) {
3951
+ if (this.state === s) return;
3952
+ this.state = s;
3953
+ this.options.onState(s);
3954
+ if (s !== "speaking" && s !== "thinking") {
3955
+ for (const r of this.idleWaiters.splice(0)) r();
3956
+ }
3957
+ }
3958
+ /** Resolve when the engine is no longer speaking (immediate if already idle). */
3959
+ awaitIdle() {
3960
+ if (this.state !== "speaking" && this.state !== "thinking") return Promise.resolve();
3961
+ return new Promise((r) => this.idleWaiters.push(r));
3962
+ }
3963
+ // --- speaking side (host-driven) ---
3964
+ /** open a spoken turn (idempotent — safe from both onUtterance and first-delta paths).
3965
+ * `ack` speaks the configured micro-ack as the context opener (utterance path only —
3966
+ * masks LLM TTFT; re-voice turns begun by their first delta skip it). */
3967
+ beginSpeech(ack = false) {
3968
+ if (this.speaking && this.ctxOpen) return;
3969
+ if (this.drainTimer) {
3970
+ clearTimeout(this.drainTimer);
3971
+ this.drainTimer = null;
3972
+ }
3973
+ this.interrupted = false;
3974
+ if (!this.speaking) this.player.markTurn();
3975
+ this.speaking = true;
3976
+ this.ctxOpen = true;
3977
+ this.spokeDeltas = false;
3978
+ this.reply = "";
3979
+ this.echoWords = new Set(this.words(this.prevReply));
3980
+ this.tts.newContext();
3981
+ if (ack && this.options.ackPhrase) {
3982
+ this.tts.speak(this.options.ackPhrase + " ", true);
3983
+ this.spokeDeltas = true;
3984
+ this.ackAt = now();
3985
+ }
3986
+ this.setState("thinking");
3987
+ }
3988
+ speakDelta(text) {
3989
+ if (this.interrupted) return;
3990
+ if (!this.speaking || !this.ctxOpen) this.beginSpeech();
3991
+ this.reply += text;
3992
+ for (const w of this.words(this.reply)) this.echoWords.add(w);
3993
+ this.tts.speak(text, true);
3994
+ this.spokeDeltas = true;
3995
+ this.setState("speaking");
3996
+ }
3997
+ /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
3998
+ endSpeech() {
3999
+ this.interrupted = false;
4000
+ if (!this.speaking) return;
4001
+ this.ctxOpen = false;
4002
+ if (this.reply) this.prevReply = this.reply;
4003
+ const settle = () => {
4004
+ if (this.ctxOpen) {
4005
+ this.drainTimer = null;
4006
+ return;
4007
+ }
4008
+ this.drainTimer = null;
4009
+ this.speaking = false;
4010
+ this.echoUntil = now() + 2500;
4011
+ if (!this.usingAec) this.stt.reset();
4012
+ this.setState("listening");
4013
+ };
4014
+ const drainThenSettle = () => {
4015
+ if (this.drainTimer) clearTimeout(this.drainTimer);
4016
+ this.drainTimer = setTimeout(settle, this.player.drainMs() + 300);
4017
+ };
4018
+ if (this.spokeDeltas) {
4019
+ this.tts.onDone = drainThenSettle;
4020
+ this.tts.end();
4021
+ if (this.drainTimer) clearTimeout(this.drainTimer);
4022
+ this.drainTimer = setTimeout(drainThenSettle, 15e3);
4023
+ } else drainThenSettle();
4024
+ }
4025
+ /** text of the reply cut by the last barge-in — consumed by the host to tell the model what
4026
+ * the user did NOT hear. Cleared on read. */
4027
+ takeInterruptedReply() {
4028
+ const r = this.lastInterrupted;
4029
+ this.lastInterrupted = null;
4030
+ return r;
4031
+ }
4032
+ /** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
4033
+ interrupt() {
4034
+ if (!this.speaking && !this.drainTimer) return;
4035
+ if (this.drainTimer) {
4036
+ clearTimeout(this.drainTimer);
4037
+ this.drainTimer = null;
4038
+ }
4039
+ const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
4040
+ if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
4041
+ this.speaking = false;
4042
+ this.ctxOpen = false;
4043
+ this.interrupted = true;
4044
+ this.suspectUntil = 0;
4045
+ this.echoUntil = now() + 2500;
4046
+ this.tts.cancel();
4047
+ this.player.kill();
4048
+ if (!this.usingAec) this.stt.reset();
4049
+ if (this.reply) this.prevReply = this.reply;
4050
+ this.setState("listening");
4051
+ }
4052
+ stop() {
4053
+ if (this.pendingTimer) clearTimeout(this.pendingTimer);
4054
+ if (this.drainTimer) clearTimeout(this.drainTimer);
4055
+ this.stt.stop();
4056
+ this.player.kill();
4057
+ this.tts.close();
4058
+ this.setState("idle");
4059
+ }
4060
+ // --- listening side (STT-driven) ---
4061
+ words(s) {
4062
+ return s.toLowerCase().replace(/[^a-z0-9\s]/g, "").split(/\s+/).filter((w) => w.length >= 2);
4063
+ }
4064
+ novelWords(text) {
4065
+ return this.words(text).filter((w) => !this.echoWords.has(w));
4066
+ }
4067
+ echoActive() {
4068
+ return this.speaking || now() < this.echoUntil;
4069
+ }
4070
+ handlePartial(text) {
4071
+ if (this.speaking) {
4072
+ const barge = this.novelWords(text).length >= (this.usingAec ? 1 : this.suspectUntil ? 1 : 2);
4073
+ if (barge) {
4074
+ const phase = this.ctxOpen ? "speaking" : "drain";
4075
+ this.interrupt();
4076
+ this.options.onBargeIn(phase);
4077
+ }
4078
+ return;
4079
+ }
4080
+ if (this.pendingUtt && text.trim()) {
4081
+ if (this.pendingTimer) {
4082
+ clearTimeout(this.pendingTimer);
4083
+ this.pendingTimer = null;
4084
+ }
4085
+ }
4086
+ if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
4087
+ }
4088
+ handleUtterance(text) {
4089
+ if (this.echoActive() && this.novelWords(text).length < (this.usingAec ? 1 : 2)) {
4090
+ this.stt.reset();
4091
+ return;
4092
+ }
4093
+ const squash = (t) => t.toLowerCase().replace(/[^a-z]/g, "").replace(/(.)\1+/g, "$1");
4094
+ if (this.ackAt && now() - this.ackAt < 6e3 && squash(text) === squash(this.options.ackPhrase)) {
4095
+ this.ackAt = 0;
4096
+ return;
4097
+ }
4098
+ this.pendingUtt = this.pendingUtt ? `${this.pendingUtt} ${text}` : text;
4099
+ if (this.pendingTimer) clearTimeout(this.pendingTimer);
4100
+ if (!this.options.utteranceMergeMs) return this.flushUtterance();
4101
+ this.pendingTimer = setTimeout(() => this.flushUtterance(), this.options.utteranceMergeMs);
4102
+ }
4103
+ flushUtterance() {
4104
+ if (this.pendingTimer) {
4105
+ clearTimeout(this.pendingTimer);
4106
+ this.pendingTimer = null;
4107
+ }
4108
+ const text = this.pendingUtt;
4109
+ this.pendingUtt = "";
4110
+ if (text) this.options.onUtterance(text);
4111
+ }
4112
+ /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4113
+ handleLevel(rms) {
4114
+ if (this.usingAec) return;
4115
+ if (!this.speaking) {
4116
+ this.baseline = 0;
4117
+ this.hot = 0;
4118
+ return;
4119
+ }
4120
+ if (!this.baseline) {
4121
+ this.baseline = rms;
4122
+ return;
4123
+ }
4124
+ this.baseline = this.baseline * 0.9 + rms * 0.1;
4125
+ if (rms > Math.max(this.baseline * this.options.bargeRmsMult, this.options.bargeRmsFloor)) this.hot++;
4126
+ else this.hot = 0;
4127
+ if (this.hot >= 2 && !this.suspectUntil) {
4128
+ this.suspectUntil = now() + 1300;
4129
+ setTimeout(() => {
4130
+ this.suspectUntil = 0;
4131
+ }, 1350);
4132
+ }
4133
+ }
4134
+ };
4135
+
4136
+ // src/voice/soniox.ts
4137
+ init_logging();
4138
+
4139
+ // src/voice/types.ts
4140
+ var STT_SAMPLE_RATE = 16e3;
4141
+ var TTS_SAMPLE_RATE = 44100;
4142
+ async function resolveAuth(auth) {
4143
+ return typeof auth === "function" ? await auth() : auth;
4144
+ }
4145
+
4146
+ // src/voice/soniox.ts
4147
+ var log9 = forComponent("SonioxSTT");
4148
+ var now2 = () => performance.now();
4149
+ var SonioxSTTOptions = class {
4150
+ auth = "";
4151
+ source;
4152
+ model = "stt-rt-preview";
4153
+ languageHints = ["en"];
4154
+ };
4155
+ var SonioxSTT = class {
4156
+ options;
4157
+ ws;
4158
+ stopped = false;
4159
+ sourceStarted = false;
4160
+ onPartial = () => {
4161
+ };
4162
+ onUtterance = () => {
4163
+ };
4164
+ /** mic energy (RMS) per chunk — drives the energy-based heuristic barge-in tier */
4165
+ onLevel = () => {
4166
+ };
4167
+ finalText = "";
4168
+ partialText = "";
4169
+ constructor(options) {
4170
+ this.options = { ...new SonioxSTTOptions(), ...options };
4171
+ }
4172
+ get usingAec() {
4173
+ return this.options.source?.aec ?? false;
4174
+ }
4175
+ async connectWs() {
4176
+ const apiKey = await resolveAuth(this.options.auth);
4177
+ this.ws = new WebSocket("wss://stt-rt.soniox.com/transcribe-websocket");
4178
+ await new Promise((res, rej) => {
4179
+ this.ws.onopen = () => res();
4180
+ this.ws.onerror = (e) => rej(new Error(`soniox ws: ${e.message || "connect failed"}`));
4181
+ });
4182
+ this.ws.send(
4183
+ JSON.stringify({
4184
+ api_key: apiKey,
4185
+ model: this.options.model,
4186
+ audio_format: "pcm_s16le",
4187
+ sample_rate: STT_SAMPLE_RATE,
4188
+ num_channels: 1,
4189
+ language_hints: this.options.languageHints,
4190
+ enable_endpoint_detection: true
4191
+ })
4192
+ );
4193
+ this.ws.onmessage = (ev) => this.handle(JSON.parse(String(ev.data)));
4194
+ this.ws.onclose = (ev) => {
4195
+ if (this.stopped) return;
4196
+ log9.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
4197
+ this.reset();
4198
+ this.connectWs().catch((e) => log9.error(`soniox reconnect failed: ${e.message}`));
4199
+ };
4200
+ }
4201
+ async start() {
4202
+ await this.connectWs();
4203
+ if (this.sourceStarted) return;
4204
+ this.sourceStarted = true;
4205
+ await this.options.source.start((chunk) => {
4206
+ let sum = 0;
4207
+ const view = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
4208
+ for (let i = 0; i + 1 < chunk.byteLength; i += 2) {
4209
+ const v = view.getInt16(i, true);
4210
+ sum += v * v;
4211
+ }
4212
+ this.onLevel(Math.sqrt(sum / (chunk.byteLength / 2)));
4213
+ if (this.ws.readyState === WebSocket.OPEN) this.ws.send(chunk);
4214
+ });
4215
+ }
4216
+ handle(m) {
4217
+ if (m.error_message) return log9.error(`soniox: ${m.error_message}`);
4218
+ let endpoint = false;
4219
+ for (const t of m.tokens ?? []) {
4220
+ if (t.text === "<end>") endpoint = true;
4221
+ else if (t.is_final) this.finalText += t.text;
4222
+ }
4223
+ this.partialText = (m.tokens ?? []).filter((t) => !t.is_final && t.text !== "<end>").map((t) => t.text).join("");
4224
+ this.onPartial(this.finalText + this.partialText);
4225
+ if (endpoint && this.finalText.trim()) {
4226
+ const utterance = this.finalText.trim();
4227
+ this.reset();
4228
+ this.onUtterance(utterance, now2());
4229
+ }
4230
+ }
4231
+ reset() {
4232
+ this.finalText = "";
4233
+ this.partialText = "";
4234
+ }
4235
+ stop() {
4236
+ this.stopped = true;
4237
+ this.options.source?.stop();
4238
+ if (this.ws) this.ws.onclose = null;
4239
+ this.ws?.close();
4240
+ }
4241
+ };
4242
+
4243
+ // src/voice/cartesia.ts
4244
+ init_logging();
4245
+ var log10 = forComponent("CartesiaTTS");
4246
+ var now3 = () => performance.now();
4247
+ var CartesiaTTSOptions = class {
4248
+ auth = "";
4249
+ voiceId = "";
4250
+ model = "sonic-3.5";
4251
+ /** 'apiKey' (server/CLI) → `api_key=` URL param; 'token' (browser, BE-minted) → `access_token=`. */
4252
+ authMode = "apiKey";
4253
+ };
4254
+ var CartesiaTTS = class {
4255
+ options;
4256
+ ws;
4257
+ ctxSeq = 0;
4258
+ ctxId = "";
4259
+ onAudio = () => {
4260
+ };
4261
+ onDone = () => {
4262
+ };
4263
+ firstAudioAt = 0;
4264
+ constructor(options) {
4265
+ this.options = { ...new CartesiaTTSOptions(), ...options };
4266
+ }
4267
+ async connect() {
4268
+ const key = await resolveAuth(this.options.auth);
4269
+ const param = this.options.authMode === "token" ? "access_token" : "api_key";
4270
+ this.ws = new WebSocket(`wss://api.cartesia.ai/tts/websocket?cartesia_version=2026-03-01&${param}=${key}`);
4271
+ await new Promise((res, rej) => {
4272
+ this.ws.onopen = () => res();
4273
+ this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
4274
+ });
4275
+ this.ws.onclose = (ev) => log10.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
4276
+ this.ws.onmessage = (ev) => {
4277
+ const m = JSON.parse(String(ev.data));
4278
+ if (m.context_id && m.context_id !== this.ctxId) return;
4279
+ if (m.type === "chunk" && m.data) {
4280
+ if (!this.firstAudioAt) this.firstAudioAt = now3();
4281
+ this.onAudio(base64ToBytes(m.data));
4282
+ } else if (m.type === "done") this.onDone();
4283
+ else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log10.warn(`cartesia: ${JSON.stringify(m)}`);
4284
+ };
4285
+ }
4286
+ newContext() {
4287
+ this.ctxId = `ctx-${++this.ctxSeq}`;
4288
+ this.firstAudioAt = 0;
4289
+ return this.ctxId;
4290
+ }
4291
+ frame(transcript, cont) {
4292
+ return JSON.stringify({
4293
+ model_id: this.options.model,
4294
+ transcript,
4295
+ voice: { mode: "id", id: this.options.voiceId },
4296
+ output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: TTS_SAMPLE_RATE },
4297
+ context_id: this.ctxId,
4298
+ continue: cont
4299
+ });
4300
+ }
4301
+ speak(text, cont) {
4302
+ if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
4303
+ }
4304
+ end() {
4305
+ if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
4306
+ }
4307
+ cancel() {
4308
+ if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
4309
+ }
4310
+ close() {
4311
+ if (this.ws) this.ws.onclose = null;
4312
+ this.ws?.close();
4313
+ }
4314
+ };
4315
+ function base64ToBytes(b64) {
4316
+ if (typeof Buffer !== "undefined") return Buffer.from(b64, "base64");
4317
+ const bin = atob(b64);
4318
+ const out = new Uint8Array(bin.length);
4319
+ for (let i = 0; i < bin.length; i++) out[i] = bin.charCodeAt(i);
4320
+ return out;
4321
+ }
4322
+
4323
+ // src/index.ts
3764
4324
  import { MemFilesystem as MemFilesystem3, IndexedDbFilesystem, CommandExecutor as CommandExecutor2, registerHeadlessCommands as registerHeadlessCommands2 } from "@livx.cc/wcli/core";
3765
4325
  export {
3766
4326
  Agent,
3767
4327
  AgentOptions,
3768
4328
  BodDbFilesystem,
4329
+ CartesiaTTS,
4330
+ CartesiaTTSOptions,
3769
4331
  CommandExecutor2 as CommandExecutor,
3770
4332
  ConsoleHostBridge,
3771
4333
  DEFAULT_DENY,
@@ -3785,9 +4347,15 @@ export {
3785
4347
  PermissionPolicy,
3786
4348
  RecordingHooks,
3787
4349
  RecordingLifecycle,
4350
+ STT_SAMPLE_RATE,
3788
4351
  SandboxJobRegistry,
3789
4352
  ScriptedHostBridge,
4353
+ SonioxSTT,
4354
+ SonioxSTTOptions,
4355
+ TTS_SAMPLE_RATE,
3790
4356
  VOICE_SYSTEM_PROMPT,
4357
+ VoiceEngine,
4358
+ VoiceEngineOptions,
3791
4359
  applyEditsTool,
3792
4360
  askUserQuestionTool,
3793
4361
  bashTool,
@@ -3799,6 +4367,7 @@ export {
3799
4367
  defaultTools,
3800
4368
  diskAgentOptions,
3801
4369
  editTool,
4370
+ exitSessionTool,
3802
4371
  expandCommand,
3803
4372
  expandTemplate,
3804
4373
  forComponent,
@@ -3835,6 +4404,7 @@ export {
3835
4404
  relevanceScore,
3836
4405
  repoIndex,
3837
4406
  repoMapTool,
4407
+ resolveAuth,
3838
4408
  rollbackTool,
3839
4409
  sandboxAgentOptions,
3840
4410
  slugify,