agent.libx.js 0.89.9 → 0.92.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/cli/cli.ts +1981 -0
- package/dist/{Agent-B0l9qT_j.d.ts → Agent-BzwprwHr.d.ts} +1 -1
- package/dist/cli.d.ts +2 -2
- package/dist/cli.js +1208 -192
- package/dist/cli.js.map +1 -1
- package/dist/index.d.ts +211 -7
- package/dist/index.js +645 -14
- package/dist/index.js.map +1 -1
- package/dist/{mcp-Dg3vA1Uj.d.ts → mcp-Bn5TlRbV.d.ts} +10 -2
- package/dist/mcp.client.d.ts +2 -2
- package/dist/mcp.client.js +19 -7
- package/dist/mcp.client.js.map +1 -1
- package/dist/{tools-Ch-OzOU8.d.ts → tools-CeK5AquG.d.ts} +11 -2
- package/dist/tools.shell.d.ts +1 -1
- package/dist/tools.shell.js.map +1 -1
- package/package.json +2 -2
package/dist/index.js
CHANGED
|
@@ -851,6 +851,17 @@ ${out}`.trim() : out || "(command succeeded, no output)";
|
|
|
851
851
|
);
|
|
852
852
|
return `Started background job ${id} \u2014 poll with JobOutput({id:"${id}"}) / JobStatus, stop with JobKill.`;
|
|
853
853
|
}
|
|
854
|
+
/**
 * Build the `ExitSession` tool descriptor.
 *
 * @param onExit callback invoked (synchronously, not awaited) every time the tool runs.
 * @returns a tool object with `name`, `description`, an empty-object `parameters` schema and an
 *   async `run` that calls `onExit` and resolves to a fixed farewell string.
 */
function exitSessionTool(onExit) {
  const description = `End the current session and exit the CLI. Call this when the user says goodbye, asks to quit, or clearly indicates they want to stop the conversation (e.g. "ok bye", "that's all", "exit", "goodnight").`;
  // The tool takes no arguments.
  const parameters = { type: "object", properties: {} };
  return {
    name: "ExitSession",
    description,
    parameters,
    run: async () => {
      onExit();
      return "Session ending. Goodbye!";
    }
  };
}
|
|
854
865
|
/** Returns a fresh array holding the three built-in tools, in order: bash, read, edit. */
function defaultTools() {
  return Array.of(bashTool, readTool, editTool);
}
|
|
@@ -2835,7 +2846,15 @@ var Agent = class _Agent {
|
|
|
2835
2846
|
toolCallsTotal += toolCalls.length;
|
|
2836
2847
|
if (o.maxToolCalls && toolCallsTotal > o.maxToolCalls) return kill("max_tool_calls");
|
|
2837
2848
|
for (const tc of toolCalls) {
|
|
2838
|
-
const
|
|
2849
|
+
const raw = await this.dispatch(tc);
|
|
2850
|
+
let content;
|
|
2851
|
+
if (typeof raw === "string") {
|
|
2852
|
+
content = raw;
|
|
2853
|
+
} else {
|
|
2854
|
+
const parts = [{ type: "text", text: raw.text }];
|
|
2855
|
+
for (const img of raw.images ?? []) parts.push(imagePart(`data:${img.mimeType};base64,${img.data}`));
|
|
2856
|
+
content = parts;
|
|
2857
|
+
}
|
|
2839
2858
|
this.transcript.push({ role: "tool", tool_call_id: tc.id, name: tc.function.name, content });
|
|
2840
2859
|
}
|
|
2841
2860
|
}
|
|
@@ -2892,10 +2911,17 @@ var Agent = class _Agent {
|
|
|
2892
2911
|
return earlyError;
|
|
2893
2912
|
}
|
|
2894
2913
|
let result;
|
|
2914
|
+
let images;
|
|
2895
2915
|
let threw = false;
|
|
2896
2916
|
try {
|
|
2897
2917
|
log3.debug(`${tc.function.name}(${tc.function.arguments})`);
|
|
2898
|
-
|
|
2918
|
+
const raw = await tool.run(args, this.ctx);
|
|
2919
|
+
if (typeof raw === "string") {
|
|
2920
|
+
result = raw;
|
|
2921
|
+
} else {
|
|
2922
|
+
result = raw.text;
|
|
2923
|
+
images = raw.images;
|
|
2924
|
+
}
|
|
2899
2925
|
} catch (e) {
|
|
2900
2926
|
const msg = e instanceof Error ? e.message : String(e);
|
|
2901
2927
|
log3.debug(`${tc.function.name} -> error: ${msg}`);
|
|
@@ -2905,7 +2931,12 @@ var Agent = class _Agent {
|
|
|
2905
2931
|
if (!threw) result = await this.maybeAutoTest(tc.function.name, result);
|
|
2906
2932
|
await hooks?.postToolUse?.(call, result, meta);
|
|
2907
2933
|
this.options.host?.notify?.({ kind: "tool_result", id: tc.id ?? "", output: result, isError: threw });
|
|
2908
|
-
|
|
2934
|
+
if (images?.length) {
|
|
2935
|
+
for (const img of images) {
|
|
2936
|
+
this.options.host?.notify?.({ kind: "tool_result_image", id: tc.id ?? "", dataUrl: `data:${img.mimeType};base64,${img.data}` });
|
|
2937
|
+
}
|
|
2938
|
+
}
|
|
2939
|
+
return images?.length ? { text: result, images } : result;
|
|
2909
2940
|
}
|
|
2910
2941
|
static WRITE_CLASS = ["Write", "Edit", "MultiEdit", "ApplyEdits"];
|
|
2911
2942
|
/** Append an autoTest failure section to a write-class tool result, if configured. */
|
|
@@ -3452,6 +3483,11 @@ function digestRun(messages, maxChars) {
|
|
|
3452
3483
|
import { MemFilesystem as MemFilesystem2 } from "@livx.cc/wcli/core";
|
|
3453
3484
|
init_logging();
|
|
3454
3485
|
var log7 = forComponent("DuplexAgent");
|
|
3486
|
+
/**
 * One-line label for a tool call: the tool name, plus (when available) a short hint taken from
 * the first argument value that is a non-blank string.
 *
 * The hint has whitespace runs collapsed to single spaces, is trimmed, and is cut to 48 chars.
 *
 * @param call object with a `name` and optional `args`.
 * @returns `"Name"` or `"Name (hint)"`.
 */
function describeCall(call) {
  const label = `${call.name}`;
  if (!call.args) return label;
  const firstText = Object.values(call.args).find((value) => typeof value === "string" && value.trim());
  if (!firstText) return label;
  const compact = String(firstText).replace(/\s+/g, " ").trim().slice(0, 48);
  return `${label} (${compact})`;
}
|
|
3455
3491
|
var DuplexAgentOptions = class {
|
|
3456
3492
|
/** Any ai.libx.js AIClient — shared by the voice and worker agents (routed by model). */
|
|
3457
3493
|
ai;
|
|
@@ -3472,8 +3508,16 @@ var DuplexAgentOptions = class {
|
|
|
3472
3508
|
/** Awaited BEFORE a delegated worker spawns — open a per-task checkpoint frame, audit, etc.
|
|
3473
3509
|
* (post-spawn would race the worker's first edits). */
|
|
3474
3510
|
onTaskStart;
|
|
3511
|
+
/** Re-voice throttled worker progress asides ('[task t1 progress] …') so long tasks aren't dead
|
|
3512
|
+
* air. Off by default — each update costs a voice turn (LLM call + speech). */
|
|
3513
|
+
progressUpdates = false;
|
|
3514
|
+
/** Min ms between progress re-voices per task. */
|
|
3515
|
+
progressIntervalMs = 25e3;
|
|
3516
|
+
/** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
|
|
3517
|
+
* (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
|
|
3518
|
+
quickLook;
|
|
3475
3519
|
};
|
|
3476
|
-
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. Never read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.';
|
|
3520
|
+
var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. 
Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
|
|
3477
3521
|
var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
|
|
3478
3522
|
var DuplexAgent = class {
|
|
3479
3523
|
options;
|
|
@@ -3493,7 +3537,10 @@ var DuplexAgent = class {
|
|
|
3493
3537
|
model: o.voiceModel,
|
|
3494
3538
|
stream: true,
|
|
3495
3539
|
host: o.host,
|
|
3496
|
-
|
|
3540
|
+
// Runtime context line: without it the voice confidently invents "facts" like today's date
|
|
3541
|
+
// (its training cutoff) instead of delegating or admitting it doesn't know.
|
|
3542
|
+
systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
|
|
3543
|
+
Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
|
|
3497
3544
|
instructionFiles: false,
|
|
3498
3545
|
maxSteps: 8,
|
|
3499
3546
|
// a voice turn should never loop
|
|
@@ -3502,7 +3549,7 @@ var DuplexAgent = class {
|
|
|
3502
3549
|
// no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
|
|
3503
3550
|
// voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
|
|
3504
3551
|
// tools come in via voiceOptions.tools and are merged here.
|
|
3505
|
-
tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool()]
|
|
3552
|
+
tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool()]
|
|
3506
3553
|
});
|
|
3507
3554
|
}
|
|
3508
3555
|
/** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
|
|
@@ -3555,18 +3602,69 @@ ${recent}` : brief;
|
|
|
3555
3602
|
spawnWorker(id, label, briefText) {
|
|
3556
3603
|
const o = this.options;
|
|
3557
3604
|
const controller = new AbortController();
|
|
3605
|
+
const base = o.workerOptions?.hooks;
|
|
3606
|
+
const report = o.progressUpdates ? this.progressReporter(id) : void 0;
|
|
3607
|
+
const hooks = report ? {
|
|
3608
|
+
...base,
|
|
3609
|
+
preToolUse: async (call, meta) => {
|
|
3610
|
+
const d = await base?.preToolUse?.(call, meta);
|
|
3611
|
+
report.pre(call);
|
|
3612
|
+
return d;
|
|
3613
|
+
},
|
|
3614
|
+
postToolUse: async (call, result, meta) => {
|
|
3615
|
+
await base?.postToolUse?.(call, result, meta);
|
|
3616
|
+
report.post(call);
|
|
3617
|
+
}
|
|
3618
|
+
} : base;
|
|
3558
3619
|
const worker = new Agent({
|
|
3559
3620
|
ai: o.ai,
|
|
3560
3621
|
fs: o.fs,
|
|
3561
3622
|
model: o.workerModel,
|
|
3562
3623
|
...o.workerOptions,
|
|
3563
3624
|
// may override ai/fs/model/tools/… —
|
|
3625
|
+
...hooks ? { hooks } : {},
|
|
3564
3626
|
signal: controller.signal
|
|
3565
3627
|
// …but never the per-task cancellation signal
|
|
3566
3628
|
});
|
|
3567
3629
|
const promise = worker.run(briefText).then((res) => this.onWorkerSettled(id, res)).catch((err) => this.onWorkerFailed(id, err));
|
|
3568
3630
|
this.tasks.set(id, { id, label, status: "running", controller, promise });
|
|
3569
3631
|
}
|
|
3632
|
+
/** Throttled per-task progress: worker tool calls → at most one progress re-voice per interval.
|
|
3633
|
+
* Two sources, one throttle: completed steps (post) and a heartbeat for a SINGLE long tool call
|
|
3634
|
+
* (pre records the in-flight call; a self-cleaning timer narrates "still inside Bash — 70s").
|
|
3635
|
+
* Completion supersedes: nothing is emitted once the task has settled. */
|
|
3636
|
+
progressReporter(id) {
|
|
3637
|
+
let lastAt = Date.now();
|
|
3638
|
+
let steps = 0;
|
|
3639
|
+
let inflight = null;
|
|
3640
|
+
const due = () => {
|
|
3641
|
+
const rec = this.tasks.get(id);
|
|
3642
|
+
return rec && rec.status === "running" && Date.now() - lastAt >= this.options.progressIntervalMs ? rec : void 0;
|
|
3643
|
+
};
|
|
3644
|
+
const emit = (rec, line, call) => {
|
|
3645
|
+
lastAt = Date.now();
|
|
3646
|
+
this.notify("task_progress", `task ${id} (${rec.label}): ${line}`, { id, steps, call: call.name });
|
|
3647
|
+
this.queueRevoice(`[task ${id} progress] ${line}`);
|
|
3648
|
+
};
|
|
3649
|
+
const timer = setInterval(() => {
|
|
3650
|
+
const rec = this.tasks.get(id);
|
|
3651
|
+
if (!rec || rec.status !== "running") return clearInterval(timer);
|
|
3652
|
+
if (!inflight || !due()) return;
|
|
3653
|
+
emit(rec, `still inside ${describeCall(inflight.call)} \u2014 ${Math.round((Date.now() - inflight.at) / 1e3)}s on this step`, inflight.call);
|
|
3654
|
+
}, Math.max(this.options.progressIntervalMs, 250));
|
|
3655
|
+
timer.unref?.();
|
|
3656
|
+
return {
|
|
3657
|
+
pre: (call) => {
|
|
3658
|
+
inflight = { call, at: Date.now() };
|
|
3659
|
+
},
|
|
3660
|
+
post: (call) => {
|
|
3661
|
+
steps++;
|
|
3662
|
+
inflight = null;
|
|
3663
|
+
const rec = due();
|
|
3664
|
+
if (rec) emit(rec, `still running \u2014 ${steps} steps so far, now: ${describeCall(call)}`, call);
|
|
3665
|
+
}
|
|
3666
|
+
};
|
|
3667
|
+
}
|
|
3570
3668
|
onWorkerSettled(id, res) {
|
|
3571
3669
|
const rec = this.tasks.get(id);
|
|
3572
3670
|
if (res.finishReason === "aborted" || rec.status === "cancelled") {
|
|
@@ -3627,6 +3725,58 @@ ${recent}` : brief;
|
|
|
3627
3725
|
}
|
|
3628
3726
|
};
|
|
3629
3727
|
}
|
|
3728
|
+
/** Sub-100ms read-only lookups the voice may do itself — everything else stays Delegate-only.
|
|
3729
|
+
* fs-only (no shell; the engine is VFS-abstracted): time, git branch (.git/HEAD read), ls, file
|
|
3730
|
+
* head. Output is hard-capped so a lookup can never bloat the skinny voice context. */
|
|
3731
|
+
quickLookTool() {
|
|
3732
|
+
const CAP = 2e3;
|
|
3733
|
+
const kinds = [.../* @__PURE__ */ new Set(["time", "branch", "ls", "file", ...Object.keys(this.options.quickLook ?? {})])];
|
|
3734
|
+
return {
|
|
3735
|
+
name: "QuickLook",
|
|
3736
|
+
description: `Instant read-only lookup \u2014 one of: ${kinds.join(", ")}. For trivial facts only; anything needing search, commands, or reasoning goes through Delegate.`,
|
|
3737
|
+
parameters: {
|
|
3738
|
+
type: "object",
|
|
3739
|
+
required: ["what"],
|
|
3740
|
+
properties: {
|
|
3741
|
+
what: { type: "string", enum: kinds, description: "what to look up" },
|
|
3742
|
+
path: { type: "string", description: "for ls/file: the path to look at" }
|
|
3743
|
+
}
|
|
3744
|
+
},
|
|
3745
|
+
run: async ({ what, path }) => {
|
|
3746
|
+
const fs = this.options.fs;
|
|
3747
|
+
try {
|
|
3748
|
+
const over = this.options.quickLook?.[String(what)];
|
|
3749
|
+
if (over) return await over(path ? String(path) : void 0);
|
|
3750
|
+
switch (String(what)) {
|
|
3751
|
+
case "time":
|
|
3752
|
+
return (/* @__PURE__ */ new Date()).toString();
|
|
3753
|
+
case "branch": {
|
|
3754
|
+
if (!fs) return "unavailable (no filesystem)";
|
|
3755
|
+
const head = (await fs.readFile(".git/HEAD")).trim();
|
|
3756
|
+
return head.startsWith("ref: refs/heads/") ? `branch: ${head.slice("ref: refs/heads/".length)}` : `detached HEAD at ${head.slice(0, 12)}`;
|
|
3757
|
+
}
|
|
3758
|
+
case "ls": {
|
|
3759
|
+
if (!fs) return "unavailable (no filesystem)";
|
|
3760
|
+
const names = await fs.readDir(String(path ?? "."));
|
|
3761
|
+
return names.slice(0, 50).join("\n") + (names.length > 50 ? `
|
|
3762
|
+
\u2026 (+${names.length - 50} more)` : "");
|
|
3763
|
+
}
|
|
3764
|
+
case "file": {
|
|
3765
|
+
if (!fs) return "unavailable (no filesystem)";
|
|
3766
|
+
if (!path) return "file lookup needs a path";
|
|
3767
|
+
const text = await fs.readFile(String(path));
|
|
3768
|
+
return text.length > CAP ? text.slice(0, CAP) + `
|
|
3769
|
+
\u2026 (truncated \u2014 ${text.length} chars total; Delegate for the full file)` : text;
|
|
3770
|
+
}
|
|
3771
|
+
default:
|
|
3772
|
+
return `unknown lookup '${what}'`;
|
|
3773
|
+
}
|
|
3774
|
+
} catch (e) {
|
|
3775
|
+
return `lookup failed: ${e?.message ?? e}`;
|
|
3776
|
+
}
|
|
3777
|
+
}
|
|
3778
|
+
};
|
|
3779
|
+
}
|
|
3630
3780
|
cancelTaskTool() {
|
|
3631
3781
|
return {
|
|
3632
3782
|
name: "CancelTask",
|
|
@@ -3645,15 +3795,26 @@ ${recent}` : brief;
|
|
|
3645
3795
|
};
|
|
3646
3796
|
|
|
3647
3797
|
// src/mcp.ts
|
|
3648
|
-
function
|
|
3649
|
-
if (result == null) return "";
|
|
3650
|
-
if (typeof result === "string") return result;
|
|
3798
|
+
/**
 * Normalize a raw MCP tool result into `{ text, images? }`.
 *
 * - `null` / `undefined` → `{ text: "" }`; a plain string → `{ text: <string> }`.
 * - An array `content`: parts with `type: "image"`, a string `data` and a truthy `mimeType` are
 *   collected as images; parts with a string `text` contribute that text; anything else is
 *   JSON-stringified. Texts are joined with newlines. `images` is only present when non-empty.
 * - If that yields neither text nor images (or `content` is not an array), the whole result is
 *   JSON-stringified into `text`.
 */
function toResult(result) {
  if (result == null) return { text: "" };
  if (typeof result === "string") return { text: result };

  const parts = result.content;
  if (Array.isArray(parts)) {
    const images = [];
    const lines = [];
    for (const part of parts) {
      const isImage = part?.type === "image" && typeof part.data === "string" && part.mimeType;
      if (isImage) {
        images.push({ mimeType: part.mimeType, data: part.data });
        continue;
      }
      lines.push(typeof part?.text === "string" ? part.text : JSON.stringify(part));
    }
    const text = lines.join("\n");
    if (images.length) return { text, images };
    if (text) return { text };
  }

  // Nothing usable was extracted: fall back to the raw payload.
  return { text: JSON.stringify(result) };
}
|
|
3658
3819
|
function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
|
|
3659
3820
|
return {
|
|
@@ -3661,7 +3822,8 @@ function mcpToolToAgentTool(spec, callTool, prefix = "mcp__") {
|
|
|
3661
3822
|
description: spec.description ?? `MCP tool ${spec.name}`,
|
|
3662
3823
|
parameters: spec.inputSchema ?? { type: "object", properties: {} },
|
|
3663
3824
|
async run(args, _ctx) {
|
|
3664
|
-
|
|
3825
|
+
const r = toResult(await callTool(spec.name, args ?? {}));
|
|
3826
|
+
return r.images?.length ? r : r.text;
|
|
3665
3827
|
}
|
|
3666
3828
|
};
|
|
3667
3829
|
}
|
|
@@ -3703,7 +3865,8 @@ function makeMcpToolSearch(specs, callTool, options = {}) {
|
|
|
3703
3865
|
async run({ name, args }) {
|
|
3704
3866
|
const n = String(name ?? "");
|
|
3705
3867
|
if (!byName.has(n)) return `Error: unknown MCP tool '${n}'. Use ToolSearch to find valid names.`;
|
|
3706
|
-
|
|
3868
|
+
const r = toResult(await callTool(n, args ?? {}));
|
|
3869
|
+
return r.images?.length ? r : r.text;
|
|
3707
3870
|
}
|
|
3708
3871
|
};
|
|
3709
3872
|
return [searchTool, callMcpTool];
|
|
@@ -3761,11 +3924,471 @@ var RecordingLifecycle = class {
|
|
|
3761
3924
|
|
|
3762
3925
|
// src/index.ts
|
|
3763
3926
|
init_logging();
|
|
3927
|
+
|
|
3928
|
+
// src/voice/engine.ts
|
|
3929
|
+
init_logging();
|
|
3930
|
+
var log8 = forComponent("VoiceEngine");
|
|
3931
|
+
var now = () => performance.now();
|
|
3932
|
+
/** Option bag for VoiceEngine. Every field below is a default that the caller may override. */
var VoiceEngineOptions = class {
  // Speech-to-text, text-to-speech and audio player; no defaults are provided here.
  stt;
  tts;
  player;

  // Called with a final (endpointed) utterance; the host dispatches it as a turn.
  onUtterance = () => {};
  // Called with the live partial transcript while listening (host renders the 🎤 line).
  onPartial = () => {};
  // Called with the new state on every state change.
  onState = () => {};
  // Called AFTER audio has been killed when the user spoke/acted over playback, so the host can
  // abort the in-flight turn. Receives a phase: 'speaking' = cut mid-speech (real interruption);
  // 'drain' = in the final audio tail (normal turn-taking — hosts shouldn't alarm).
  onBargeIn = () => {};

  // Micro-ack spoken on utterance endpoint (masks LLM TTFT); '' disables it.
  ackPhrase = "";
  // Endpoint merge window in ms: an endpointed utterance is held this long so that resumed speech
  // (spelled letters, mid-thought pauses) MERGES into it instead of being dispatched truncated
  // ("E-L-Y." / "A."). Adds this much latency per turn; 0 disables.
  utteranceMergeMs = 350;

  // Tuning for the heuristic (non-AEC) energy barge-in.
  bargeRmsMult = 2;
  bargeRmsFloor = 500;
};
|
|
3959
|
+
/**
 * Voice I/O state machine: wires an STT source, a TTS sink and an audio player together and
 * tracks who is talking. States used here: "idle", "listening", "thinking", "speaking".
 *
 * Speaking side is host-driven (beginSpeech / speakDelta / endSpeech / interrupt); listening side
 * is driven by the STT callbacks installed in start() (partials, utterances, level readings).
 */
var VoiceEngine = class {
  options;
  state = "idle";
  stt;
  tts;
  player;
  // audible (deltas flowing OR audio draining)
  speaking = false;
  // the current TTS context still accepts deltas (false once end-frame sent)
  ctxOpen = false;
  // barge-in latch: drop in-flight deltas until the next legitimate turn
  interrupted = false;
  // something has been sent to the TTS context of the current spoken turn
  spokeDeltas = false;
  drainTimer = null;
  // heuristic tier state (inert under AEC) — frozen as validated in the experiment
  echoWords = /* @__PURE__ */ new Set();
  prevReply = "";
  reply = "";
  echoUntil = 0;
  baseline = 0;
  hot = 0;
  suspectUntil = 0;
  // timer that closes the current suspect window; tracked so interrupt()/stop() can cancel it
  suspectTimer = null;
  // when the micro-ack was spoken — its echo can leak before the AEC filter converges
  ackAt = 0;
  // endpointed text held for the merge window
  pendingUtt = "";
  pendingTimer = null;
  lastInterrupted = null;

  /**
   * @param options partial VoiceEngineOptions; merged over the defaults.
   * @throws Error when stt, tts or player is missing.
   */
  constructor(options) {
    this.options = { ...new VoiceEngineOptions(), ...options };
    const o = this.options;
    if (!o.stt || !o.tts || !o.player) throw new Error("VoiceEngine needs stt, tts and player (see cli/voice.ts VoiceIO for platform defaults)");
    this.stt = o.stt;
    this.tts = o.tts;
    this.player = o.player;
  }

  /** Install the TTS/STT callbacks, connect both in parallel, then enter "listening". */
  async start() {
    // Audio chunks are only forwarded to the player while a turn is audible.
    this.tts.onAudio = (c) => {
      if (this.speaking) this.player.write(c);
    };
    this.stt.onPartial = (text) => this.handlePartial(text);
    this.stt.onUtterance = (text) => this.handleUtterance(text);
    this.stt.onLevel = (rms) => this.handleLevel(rms);
    await Promise.all([this.tts.connect(), this.stt.start()]);
    this.setState("listening");
    log8.info(`voice I/O up (${this.stt.usingAec ? "AEC" : "heuristic echo"} capture)`);
  }

  /** Mirrors `stt.usingAec`. */
  get usingAec() {
    return this.stt.usingAec;
  }

  // resolvers registered by awaitIdle(), released by setState()
  idleWaiters = [];

  /** Change state (no-op when unchanged), notify the host, and release idle waiters when the new
   * state is neither "speaking" nor "thinking". */
  setState(s) {
    if (this.state === s) return;
    this.state = s;
    this.options.onState(s);
    if (s !== "speaking" && s !== "thinking") {
      for (const r of this.idleWaiters.splice(0)) r();
    }
  }

  /** Resolve when the engine is no longer speaking or thinking (immediate if already so). */
  awaitIdle() {
    if (this.state !== "speaking" && this.state !== "thinking") return Promise.resolve();
    return new Promise((r) => this.idleWaiters.push(r));
  }

  // --- speaking side (host-driven) ---

  /** open a spoken turn (idempotent — safe from both onUtterance and first-delta paths).
   * `ack` speaks the configured micro-ack as the context opener (utterance path only —
   * masks LLM TTFT; re-voice turns begun by their first delta skip it). */
  beginSpeech(ack = false) {
    if (this.speaking && this.ctxOpen) return;
    if (this.drainTimer) {
      clearTimeout(this.drainTimer);
      this.drainTimer = null;
    }
    this.interrupted = false;
    if (!this.speaking) this.player.markTurn();
    this.speaking = true;
    this.ctxOpen = true;
    this.spokeDeltas = false;
    this.reply = "";
    // Seed the echo vocabulary with the previous reply's words.
    this.echoWords = new Set(this.words(this.prevReply));
    this.tts.newContext();
    if (ack && this.options.ackPhrase) {
      this.tts.speak(this.options.ackPhrase + " ", true);
      this.spokeDeltas = true;
      this.ackAt = now();
    }
    this.setState("thinking");
  }

  /** Feed one text delta of the reply to TTS; opens a turn if none is open. Dropped while the
   * barge-in latch is set. */
  speakDelta(text) {
    if (this.interrupted) return;
    if (!this.speaking || !this.ctxOpen) this.beginSpeech();
    this.reply += text;
    // Re-tokenize the whole reply so words split across deltas are picked up once complete.
    for (const w of this.words(this.reply)) this.echoWords.add(w);
    this.tts.speak(text, true);
    this.spokeDeltas = true;
    this.setState("speaking");
  }

  /** close the spoken turn (idempotent); stays audible until ALL audio arrived AND playback drains */
  endSpeech() {
    this.interrupted = false;
    if (!this.speaking) return;
    this.ctxOpen = false;
    if (this.reply) this.prevReply = this.reply;
    const settle = () => {
      this.drainTimer = null;
      // A new turn reopened the context, or the turn was already torn down (interrupt/stop):
      // there is nothing left to settle.
      if (this.ctxOpen || !this.speaking) return;
      this.speaking = false;
      this.echoUntil = now() + 2500;
      if (!this.usingAec) this.stt.reset();
      this.setState("listening");
    };
    const drainThenSettle = () => {
      if (this.drainTimer) clearTimeout(this.drainTimer);
      this.drainTimer = null;
      // `tts.onDone` stays assigned after this turn; ignore a completion that arrives once the
      // engine is no longer audible so it cannot re-arm timers on a stopped/interrupted engine.
      if (!this.speaking) return;
      this.drainTimer = setTimeout(settle, this.player.drainMs() + 300);
    };
    if (this.spokeDeltas) {
      this.tts.onDone = drainThenSettle;
      this.tts.end();
      // Fallback: settle anyway if onDone has not fired within 15s.
      if (this.drainTimer) clearTimeout(this.drainTimer);
      this.drainTimer = setTimeout(drainThenSettle, 15e3);
    } else drainThenSettle();
  }

  /** text of the reply cut by the last barge-in — consumed by the host to tell the model what
   * the user did NOT hear. Cleared on read. */
  takeInterruptedReply() {
    const r = this.lastInterrupted;
    this.lastInterrupted = null;
    return r;
  }

  /** barge-in: stop audio NOW, cancel generation, reset for the user's utterance */
  interrupt() {
    if (!this.speaking && !this.drainTimer) return;
    if (this.drainTimer) {
      clearTimeout(this.drainTimer);
      this.drainTimer = null;
    }
    // Cancel the pending suspect-window timer as well: left armed, it would zero a NEW suspect
    // window opened by the next turn before that window's own 1350ms elapsed.
    if (this.suspectTimer) {
      clearTimeout(this.suspectTimer);
      this.suspectTimer = null;
    }
    // Estimate how much of the reply was heard from played milliseconds at 15 chars per second.
    const heardChars = Math.round(Math.max(0, this.player.playedMs()) / 1e3 * 15);
    if (this.reply) this.lastInterrupted = { full: this.reply, heard: this.reply.slice(0, heardChars) };
    this.speaking = false;
    this.ctxOpen = false;
    this.interrupted = true;
    this.suspectUntil = 0;
    this.echoUntil = now() + 2500;
    this.tts.cancel();
    this.player.kill();
    if (!this.usingAec) this.stt.reset();
    if (this.reply) this.prevReply = this.reply;
    this.setState("listening");
  }

  /** Shut everything down: cancel all timers, drop held/pending state, stop STT, kill playback,
   * close TTS and enter "idle". Late deltas are dropped until the next beginSpeech(). */
  stop() {
    if (this.pendingTimer) clearTimeout(this.pendingTimer);
    if (this.drainTimer) clearTimeout(this.drainTimer);
    if (this.suspectTimer) clearTimeout(this.suspectTimer);
    this.pendingTimer = null;
    this.drainTimer = null;
    this.suspectTimer = null;
    this.suspectUntil = 0;
    // A held utterance must not merge into whatever is heard after a restart.
    this.pendingUtt = "";
    // No longer audible: stops late TTS audio reaching the killed player and late partials
    // registering as barge-ins; the latch drops deltas still streaming from an in-flight turn.
    this.speaking = false;
    this.ctxOpen = false;
    this.interrupted = true;
    this.stt.stop();
    this.player.kill();
    this.tts.close();
    this.setState("idle");
  }

  // --- listening side (STT-driven) ---

  /** Lowercase, strip everything but a-z / 0-9 / whitespace, split, keep tokens of length >= 2. */
  words(s) {
    return s.toLowerCase().replace(/[^a-z0-9\s]/g, "").split(/\s+/).filter((w) => w.length >= 2);
  }

  /** Words of `text` that are not in the current echo vocabulary. */
  novelWords(text) {
    return this.words(text).filter((w) => !this.echoWords.has(w));
  }

  /** True while audible or within the post-speech echo window. */
  echoActive() {
    return this.speaking || now() < this.echoUntil;
  }

  /** STT partial transcript: while audible it can only trigger a barge-in; otherwise it pauses
   * the merge window of a held utterance and is forwarded to the host unless it looks like echo. */
  handlePartial(text) {
    if (this.speaking) {
      // Novel-word threshold: 1 under AEC or inside a suspect window, otherwise 2.
      const barge = this.novelWords(text).length >= (this.usingAec ? 1 : this.suspectUntil ? 1 : 2);
      if (barge) {
        const phase = this.ctxOpen ? "speaking" : "drain";
        this.interrupt();
        this.options.onBargeIn(phase);
      }
      return;
    }
    // Speech resumed while an utterance is held: cancel its flush so the next utterance merges.
    // NOTE(review): the timer is not re-armed here — the held text is only flushed by a later
    // accepted utterance; confirm that is intended when the follow-up is filtered as echo.
    if (this.pendingUtt && text.trim()) {
      if (this.pendingTimer) {
        clearTimeout(this.pendingTimer);
        this.pendingTimer = null;
      }
    }
    if (!this.echoActive() || this.novelWords(text).length >= 1) this.options.onPartial(text);
  }

  /** STT final utterance: drop probable echo and the echoed micro-ack, otherwise hold the text
   * for the merge window (or flush immediately when the window is disabled). */
  handleUtterance(text) {
    if (this.echoActive() && this.novelWords(text).length < (this.usingAec ? 1 : 2)) {
      this.stt.reset();
      return;
    }
    // Compare letters only, with repeated characters collapsed, to match the ack loosely.
    const squash = (t) => t.toLowerCase().replace(/[^a-z]/g, "").replace(/(.)\1+/g, "$1");
    if (this.ackAt && now() - this.ackAt < 6e3 && squash(text) === squash(this.options.ackPhrase)) {
      this.ackAt = 0;
      return;
    }
    this.pendingUtt = this.pendingUtt ? `${this.pendingUtt} ${text}` : text;
    if (this.pendingTimer) clearTimeout(this.pendingTimer);
    if (!this.options.utteranceMergeMs) return this.flushUtterance();
    this.pendingTimer = setTimeout(() => this.flushUtterance(), this.options.utteranceMergeMs);
  }

  /** Dispatch the held utterance (if any) to the host and clear the merge state. */
  flushUtterance() {
    if (this.pendingTimer) {
      clearTimeout(this.pendingTimer);
      this.pendingTimer = null;
    }
    const text = this.pendingUtt;
    this.pendingUtt = "";
    if (text) this.options.onUtterance(text);
  }

  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
  handleLevel(rms) {
    if (this.usingAec) return;
    if (!this.speaking) {
      this.baseline = 0;
      this.hot = 0;
      return;
    }
    // First reading of a turn only seeds the baseline.
    if (!this.baseline) {
      this.baseline = rms;
      return;
    }
    // Exponential moving average of the level while speaking.
    this.baseline = this.baseline * 0.9 + rms * 0.1;
    if (rms > Math.max(this.baseline * this.options.bargeRmsMult, this.options.bargeRmsFloor)) this.hot++;
    else this.hot = 0;
    // Two consecutive hot readings open a suspect window, during which handlePartial needs only
    // one novel word to confirm a barge-in.
    if (this.hot >= 2 && !this.suspectUntil) {
      this.suspectUntil = now() + 1300;
      if (this.suspectTimer) clearTimeout(this.suspectTimer);
      this.suspectTimer = setTimeout(() => {
        this.suspectTimer = null;
        this.suspectUntil = 0;
      }, 1350);
    }
  }
};
|
|
4196
|
+
|
|
4197
|
+
// src/voice/soniox.ts
|
|
4198
|
+
init_logging();
|
|
4199
|
+
|
|
4200
|
+
// src/voice/types.ts
|
|
4201
|
+
/** Sample rate (samples per second) declared as `sample_rate` in the Soniox STT session config. */
var STT_SAMPLE_RATE = 16e3;
|
|
4202
|
+
/** Sample rate (samples per second) requested in the Cartesia TTS `output_format`. */
var TTS_SAMPLE_RATE = 44100;
|
|
4203
|
+
/**
 * Resolve a credential that is given either directly or as a provider
 * function.
 *
 * @param {*} auth a credential value, or a (possibly async) function returning one
 * @returns {Promise<*>} the function's awaited result, or `auth` itself when it is not a function
 */
async function resolveAuth(auth) {
  if (typeof auth !== "function") return auth;
  const provided = await auth();
  return provided;
}
|
|
4206
|
+
|
|
4207
|
+
// src/voice/soniox.ts
|
|
4208
|
+
// Logger for the Soniox STT client, created with the component label "SonioxSTT".
var log9 = forComponent("SonioxSTT");
|
|
4209
|
+
// Timestamp helper; passed as the second argument to onUtterance.
var now2 = () => performance.now();
|
|
4210
|
+
/**
 * Default option values for SonioxSTT; spread under caller-supplied options
 * in the SonioxSTT constructor.
 */
var SonioxSTTOptions = class {
  constructor() {
    /** Credential, or a function producing one (resolved via resolveAuth). */
    this.auth = "";
    /** Audio source object; no default. */
    this.source = undefined;
    /** Model name sent in the session config. */
    this.model = "stt-rt-preview";
    /** Language hints sent in the session config. */
    this.languageHints = ["en"];
  }
};
|
|
4216
|
+
/**
 * Streaming speech-to-text client for the Soniox real-time WebSocket endpoint.
 *
 * Audio chunks from `options.source` are forwarded to the socket as they
 * arrive; token messages from the server are folded into `finalText` /
 * `partialText` and surfaced through the `onPartial`, `onUtterance` and
 * `onLevel` callbacks, which callers overwrite after construction.
 */
var SonioxSTT = class {
  options;
  ws;
  stopped = false;
  sourceStarted = false;
  /** Called with the running transcript (final + interim text) on every message. */
  onPartial = () => {
  };
  /** Called with (utterance, timestamp) when the server emits an `<end>` token. */
  onUtterance = () => {
  };
  /** mic energy (RMS) per chunk — drives the energy-based heuristic barge-in tier */
  onLevel = () => {
  };
  finalText = "";
  partialText = "";

  /** @param {object} options overrides merged over the SonioxSTTOptions defaults */
  constructor(options) {
    this.options = { ...new SonioxSTTOptions(), ...options };
  }

  /** True when the configured source reports `aec`; false when absent. */
  get usingAec() {
    return this.options.source?.aec ?? false;
  }

  /**
   * Open the WebSocket, send the session config, and install message/close
   * handlers. An unexpected close triggers one reconnect attempt.
   *
   * If stop() was called while the credential or the socket was still being
   * established, the connection is abandoned instead of being left open.
   *
   * @throws {Error} when the socket fails to open
   */
  async connectWs() {
    const apiKey = await resolveAuth(this.options.auth);
    if (this.stopped) return;
    const ws = new WebSocket("wss://stt-rt.soniox.com/transcribe-websocket");
    this.ws = ws;
    await new Promise((res, rej) => {
      ws.onopen = () => res();
      ws.onerror = (e) => rej(new Error(`soniox ws: ${e.message || "connect failed"}`));
    });
    if (this.stopped) {
      ws.close();
      return;
    }
    // The connect-time handler only rejects an already-settled promise; report later errors instead.
    ws.onerror = (e) => log9.warn(`soniox ws error: ${e.message || "unknown"}`);
    ws.send(
      JSON.stringify({
        api_key: apiKey,
        model: this.options.model,
        audio_format: "pcm_s16le",
        sample_rate: STT_SAMPLE_RATE,
        num_channels: 1,
        language_hints: this.options.languageHints,
        enable_endpoint_detection: true
      })
    );
    ws.onmessage = (ev) => {
      let message;
      try {
        message = JSON.parse(String(ev.data));
      } catch (err) {
        // A malformed frame must not throw out of the event handler.
        log9.warn(`soniox: unparseable message (${err.message})`);
        return;
      }
      this.handle(message);
    };
    ws.onclose = (ev) => {
      if (this.stopped) return;
      log9.warn(`soniox ws closed (${ev.code} ${ev.reason || ""}) \u2014 reconnecting`);
      this.reset();
      this.connectWs().catch((e) => log9.error(`soniox reconnect failed: ${e.message}`));
    };
  }

  /**
   * Connect, then start the audio source once. Each chunk is measured
   * (RMS over its 16-bit little-endian samples → onLevel) and forwarded to
   * the socket when it is open.
   */
  async start() {
    await this.connectWs();
    // Do not start capturing if stop() ran while we were connecting.
    if (this.stopped || this.sourceStarted) return;
    this.sourceStarted = true;
    await this.options.source.start((chunk) => {
      // NOTE(review): Soniox documents an empty frame as the end-of-audio
      // signal — confirm; either way an empty chunk carries no audio.
      if (chunk.byteLength === 0) return;
      const samples = chunk.byteLength >> 1; // whole 16-bit samples only
      if (samples > 0) {
        const view = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
        let sum = 0;
        for (let i = 0; i < samples; i++) {
          const v = view.getInt16(i * 2, true);
          sum += v * v;
        }
        this.onLevel(Math.sqrt(sum / samples));
      }
      if (this.ws.readyState === WebSocket.OPEN) this.ws.send(chunk);
    });
  }

  /**
   * Process one decoded server message: log errors, append final tokens to
   * `finalText`, rebuild `partialText` from the interim tokens, emit the
   * combined partial, and emit + reset the utterance on an `<end>` token.
   *
   * @param {object} m decoded server message
   */
  handle(m) {
    if (m.error_message) return log9.error(`soniox: ${m.error_message}`);
    let endpoint = false;
    for (const t of m.tokens ?? []) {
      if (t.text === "<end>") endpoint = true;
      else if (t.is_final) this.finalText += t.text;
    }
    this.partialText = (m.tokens ?? []).filter((t) => !t.is_final && t.text !== "<end>").map((t) => t.text).join("");
    this.onPartial(this.finalText + this.partialText);
    if (endpoint && this.finalText.trim()) {
      const utterance = this.finalText.trim();
      this.reset();
      this.onUtterance(utterance, now2());
    }
  }

  /** Discard the accumulated final and interim transcript text. */
  reset() {
    this.finalText = "";
    this.partialText = "";
  }

  /** Stop the source and close the socket without triggering a reconnect. */
  stop() {
    this.stopped = true;
    this.options.source?.stop();
    if (this.ws) this.ws.onclose = null;
    this.ws?.close();
  }
};
|
|
4303
|
+
|
|
4304
|
+
// src/voice/cartesia.ts
|
|
4305
|
+
init_logging();
|
|
4306
|
+
// Logger for the Cartesia TTS client, created with the component label "CartesiaTTS".
var log10 = forComponent("CartesiaTTS");
|
|
4307
|
+
// Timestamp helper; used to record when the first audio chunk of a context arrives.
var now3 = () => performance.now();
|
|
4308
|
+
/**
 * Default option values for CartesiaTTS; spread under caller-supplied options
 * in the CartesiaTTS constructor.
 */
var CartesiaTTSOptions = class {
  constructor() {
    /** Credential, or a function producing one (resolved via resolveAuth). */
    this.auth = "";
    /** Voice identifier placed in each request frame. */
    this.voiceId = "";
    /** Model identifier placed in each request frame. */
    this.model = "sonic-3.5";
    /** 'apiKey' (server/CLI) → `api_key=` URL param; 'token' (browser, BE-minted) → `access_token=`. */
    this.authMode = "apiKey";
  }
};
|
|
4315
|
+
/**
 * Streaming text-to-speech client for the Cartesia WebSocket endpoint.
 *
 * Text is sent in frames tagged with the current context id; audio chunks
 * that come back for that context are decoded from base64 and handed to
 * `onAudio`, and `onDone` fires when the server reports the context done.
 * Callers overwrite `onAudio` / `onDone` after construction.
 */
var CartesiaTTS = class {
  options;
  ws;
  ctxSeq = 0;
  ctxId = "";
  /** Called with the decoded bytes of every audio chunk for the current context. */
  onAudio = () => {
  };
  /** Called when a "done" message arrives for the current context. */
  onDone = () => {
  };
  /** Timestamp of the first audio chunk of the current context (0 = none yet). */
  firstAudioAt = 0;

  /** @param {object} options overrides merged over the CartesiaTTSOptions defaults */
  constructor(options) {
    this.options = { ...new CartesiaTTSOptions(), ...options };
  }

  /**
   * Open the WebSocket and install the message/close handlers.
   *
   * The credential is percent-encoded before being placed in the query
   * string, so characters such as `+`, `/`, `=` or `&` cannot corrupt the URL.
   *
   * @throws {Error} when the socket fails to open
   */
  async connect() {
    const key = await resolveAuth(this.options.auth);
    const param = this.options.authMode === "token" ? "access_token" : "api_key";
    const ws = new WebSocket(
      `wss://api.cartesia.ai/tts/websocket?cartesia_version=2026-03-01&${param}=${encodeURIComponent(String(key))}`
    );
    this.ws = ws;
    await new Promise((res, rej) => {
      ws.onopen = () => res();
      ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
    });
    // The connect-time handler only rejects an already-settled promise; report later errors instead.
    ws.onerror = (e) => log10.warn(`cartesia ws error: ${e.message || "unknown"}`);
    ws.onclose = (ev) => log10.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
    ws.onmessage = (ev) => {
      let m;
      try {
        m = JSON.parse(String(ev.data));
      } catch (err) {
        // A malformed frame must not throw out of the event handler.
        log10.warn(`cartesia: unparseable message (${err.message})`);
        return;
      }
      // Drop messages that belong to a context other than the current one.
      if (m.context_id && m.context_id !== this.ctxId) return;
      if (m.type === "chunk" && m.data) {
        if (!this.firstAudioAt) this.firstAudioAt = now3();
        this.onAudio(base64ToBytes(m.data));
      } else if (m.type === "done") this.onDone();
      else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log10.warn(`cartesia: ${JSON.stringify(m)}`);
    };
  }

  /**
   * Start a fresh context: bump the sequence, reset `firstAudioAt`.
   * @returns {string} the new context id (`ctx-<n>`)
   */
  newContext() {
    this.ctxId = `ctx-${++this.ctxSeq}`;
    this.firstAudioAt = 0;
    return this.ctxId;
  }

  /**
   * Build the JSON request frame for the current context.
   * @param {string} transcript text to synthesize
   * @param {boolean} cont value of the frame's `continue` flag
   * @returns {string} serialized frame
   */
  frame(transcript, cont) {
    return JSON.stringify({
      model_id: this.options.model,
      transcript,
      voice: { mode: "id", id: this.options.voiceId },
      output_format: { container: "raw", encoding: "pcm_s16le", sample_rate: TTS_SAMPLE_RATE },
      context_id: this.ctxId,
      continue: cont
    });
  }

  /** Send `text` for synthesis; silently does nothing unless the socket is open. */
  speak(text, cont) {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
  }

  /** Send an empty transcript with `continue: false` for the current context. */
  end() {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
  }

  /** Ask the server to cancel the current context. */
  cancel() {
    if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
  }

  /** Close the socket without logging the close as unexpected. */
  close() {
    if (this.ws) this.ws.onclose = null;
    this.ws?.close();
  }
};
|
|
4376
|
+
/**
 * Decode a base64 string into bytes.
 *
 * Uses `Buffer` when that global exists; otherwise decodes with `atob` and
 * copies each character code into a `Uint8Array`.
 *
 * @param {string} b64 base64-encoded data
 * @returns {Uint8Array} decoded bytes (a `Buffer` when `Buffer` is available)
 */
function base64ToBytes(b64) {
  const hasBuffer = typeof Buffer !== "undefined";
  if (hasBuffer) return Buffer.from(b64, "base64");
  const binary = atob(b64);
  return Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
}
|
|
4383
|
+
|
|
4384
|
+
// src/index.ts
|
|
3764
4385
|
import { MemFilesystem as MemFilesystem3, IndexedDbFilesystem, CommandExecutor as CommandExecutor2, registerHeadlessCommands as registerHeadlessCommands2 } from "@livx.cc/wcli/core";
|
|
3765
4386
|
export {
|
|
3766
4387
|
Agent,
|
|
3767
4388
|
AgentOptions,
|
|
3768
4389
|
BodDbFilesystem,
|
|
4390
|
+
CartesiaTTS,
|
|
4391
|
+
CartesiaTTSOptions,
|
|
3769
4392
|
CommandExecutor2 as CommandExecutor,
|
|
3770
4393
|
ConsoleHostBridge,
|
|
3771
4394
|
DEFAULT_DENY,
|
|
@@ -3785,9 +4408,15 @@ export {
|
|
|
3785
4408
|
PermissionPolicy,
|
|
3786
4409
|
RecordingHooks,
|
|
3787
4410
|
RecordingLifecycle,
|
|
4411
|
+
STT_SAMPLE_RATE,
|
|
3788
4412
|
SandboxJobRegistry,
|
|
3789
4413
|
ScriptedHostBridge,
|
|
4414
|
+
SonioxSTT,
|
|
4415
|
+
SonioxSTTOptions,
|
|
4416
|
+
TTS_SAMPLE_RATE,
|
|
3790
4417
|
VOICE_SYSTEM_PROMPT,
|
|
4418
|
+
VoiceEngine,
|
|
4419
|
+
VoiceEngineOptions,
|
|
3791
4420
|
applyEditsTool,
|
|
3792
4421
|
askUserQuestionTool,
|
|
3793
4422
|
bashTool,
|
|
@@ -3799,6 +4428,7 @@ export {
|
|
|
3799
4428
|
defaultTools,
|
|
3800
4429
|
diskAgentOptions,
|
|
3801
4430
|
editTool,
|
|
4431
|
+
exitSessionTool,
|
|
3802
4432
|
expandCommand,
|
|
3803
4433
|
expandTemplate,
|
|
3804
4434
|
forComponent,
|
|
@@ -3835,6 +4465,7 @@ export {
|
|
|
3835
4465
|
relevanceScore,
|
|
3836
4466
|
repoIndex,
|
|
3837
4467
|
repoMapTool,
|
|
4468
|
+
resolveAuth,
|
|
3838
4469
|
rollbackTool,
|
|
3839
4470
|
sandboxAgentOptions,
|
|
3840
4471
|
slugify,
|