agent.libx.js 0.92.9 → 0.93.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1970,34 +1970,61 @@ async function loadCommands(fs, dir, opts = {}) {
1970
1970
  // src/memory.ts
1971
1971
  init_tools_structured();
1972
1972
  var MAX_INDEX = 25;
1973
+ var MAX_INDEX_LINES = 200;
1974
+ var MAX_INDEX_BYTES = 25e3;
1975
+ var MEMORY_PROMPT = '## Memory guidelines\n**When to Remember:** user corrections ("no, do X instead"), preferences ("I prefer Y"), project constraints, recurring gotchas, role/context ("I\'m a data scientist"), workflow patterns confirmed by the user. Capture the WHY, not just the what.\n**When NOT to Remember:** task-specific scratch, transient state, things derivable from code/git, conversation filler, anything already in instruction files. If removing the memory wouldn\'t change future behavior, don\'t save it.\n**Types** (pass to Remember \u2014 this determines where the memory is stored):\n- `user` \u2014 role, preferences, knowledge level, collaboration style (GLOBAL \u2014 follows you across projects)\n- `feedback` \u2014 corrections, what to avoid/repeat, with WHY (GLOBAL)\n- `project` \u2014 ongoing work, goals, constraints, deadlines (LOCAL to this project; convert relative dates to absolute)\n- `reference` \u2014 pointers to external resources, URLs, vendor docs (LOCAL to this project)\n**Before acting on a recalled memory:** verify it\'s still true \u2014 a memory naming a file/function/flag is a claim from when it was written. Grep/read to confirm before recommending. Stale memory \u2192 update via Remember (same slug overwrites).\n**Dedup:** before writing, check if a similar memory exists (Recall or MemorySearch). Update the existing one rather than creating duplicates.\nCall `Recall` with a slug (or multiple slugs/a pattern) to load full bodies when relevant.\nCall `MemorySearch` with a query to find memories by content when you don\'t know the slug.\nCall `Remember` to persist a durable fact for future sessions.';
1976
+ var VOICE_MEMORY_PROMPT = `You have Remember and Recall tools \u2014 use them directly, no delegation needed.
1977
+ IMPLICIT CAPTURE: when the user shares their name, role, a preference, a correction ("no, do X instead"), or a project constraint \u2014 call Remember immediately without announcing it. Natural memory, not a ceremony.
1978
+ For explicit "remember X" requests, also call Remember directly and confirm briefly ("got it").
1979
+ Do NOT remember: transient task details, conversation filler, things you'd forget in a real conversation.
1980
+ Keep it invisible: never announce "saving to memory" or list what you remembered unless asked.
1981
+ For anything requiring files, shell, or web \u2014 still Delegate.`;
1973
1982
  async function loadMemory(fs, dir, opts = {}) {
1974
- const indexPath = `${dir}/MEMORY.md`;
1975
- const tools = [recallTool(fs, dir), rememberTool(fs, dir), memorySearchTool(fs, dir)];
1976
- const md = await fs.exists(indexPath) ? (await fs.readFile(indexPath)).trim() : "";
1977
- if (!md) return { index: "", tools };
1978
- const lines = md.split("\n");
1979
- const pointers = lines.filter((l) => /^\s*-\s*\[.+\]\(.+\.md\)/.test(l));
1980
- const header = lines.filter((l) => !/^\s*-\s*\[.+\]\(.+\.md\)/.test(l)).join("\n").trim();
1981
- const { kept, rest } = topByRelevance(pointers, opts.relevanceHint ?? "", (l) => l, opts.max ?? MAX_INDEX);
1983
+ const dirs = (Array.isArray(dir) ? dir : [dir]).filter(Boolean);
1984
+ const writeDir = dirs[0];
1985
+ const tools = [recallTool(fs, dirs), rememberTool(fs, writeDir, opts), memorySearchTool(fs, dirs)];
1986
+ const allPointers = [];
1987
+ const seenSlugs = /* @__PURE__ */ new Set();
1988
+ let header = "";
1989
+ let hasContent = false;
1990
+ for (const d of dirs) {
1991
+ const indexPath = `${d}/MEMORY.md`;
1992
+ const md = await fs.exists(indexPath) ? (await fs.readFile(indexPath)).trim() : "";
1993
+ if (!md) continue;
1994
+ hasContent = true;
1995
+ const lines = md.split("\n");
1996
+ if (!header) header = lines.filter((l) => !/^\s*-\s*\[.+\]\(.+\.md\)/.test(l)).join("\n").trim();
1997
+ for (const l of lines.filter((l2) => /^\s*-\s*\[.+\]\(.+\.md\)/.test(l2))) {
1998
+ const slug = l.match(/\]\(([^)]+)\.md\)/)?.[1];
1999
+ if (slug && !seenSlugs.has(slug)) {
2000
+ seenSlugs.add(slug);
2001
+ allPointers.push(l);
2002
+ }
2003
+ }
2004
+ }
2005
+ if (!hasContent) return { index: MEMORY_PROMPT, tools };
2006
+ const { kept, rest } = topByRelevance(allPointers, opts.relevanceHint ?? "", (l) => l, opts.max ?? MAX_INDEX);
1982
2007
  const restSlugs = rest.map((l) => l.match(/\]\(([^)]+)\.md\)/)?.[1] ?? l.match(/\[([^\]]+)\]/)?.[1] ?? "").filter(Boolean);
1983
- const index = "## Memory (persistent context \u2014 recalled across sessions)\n" + (header ? header + "\n" : "") + kept.join("\n") + (restSlugs.length ? `
1984
- - (${restSlugs.length} more learnings, slugs only \u2014 call \`Recall\` if relevant): ${restSlugs.join(", ")}` : "") + `
1985
-
1986
- These are pointers only. Call \`Recall\` with a slug (or multiple slugs/a pattern) to load full bodies when relevant.
1987
- Call \`MemorySearch\` with a query to find memories by content when you don't know the slug.
1988
- Call \`Remember\` to persist a durable fact for future sessions.`;
2008
+ const index = MEMORY_PROMPT + "\n\n## Memory index (persistent context \u2014 recalled across sessions)\n" + (header ? header + "\n" : "") + kept.join("\n") + (restSlugs.length ? `
2009
+ - (${restSlugs.length} more learnings, slugs only \u2014 call \`Recall\` if relevant): ${restSlugs.join(", ")}` : "");
1989
2010
  return { index, tools };
1990
2011
  }
1991
2012
  function slugify(s, fallback = "note") {
1992
2013
  const base = String(s ?? "").trim().toLowerCase().replace(/\.md$/i, "").replace(/[^\w\s-]/g, "").replace(/[\s_]+/g, "-").replace(/-+/g, "-").replace(/^-|-$/g, "").slice(0, 48);
1993
2014
  return base || fallback;
1994
2015
  }
1995
- async function writeFact(fs, dir, slug, body) {
2016
+ async function writeFact(fs, dir, slug, body, opts) {
1996
2017
  await mkdirp(fs, dir);
1997
- await fs.writeFile(`${dir}/${slug}.md`, body.endsWith("\n") ? body : body + "\n");
2018
+ const content = opts?.type ? `---
2019
+ type: ${opts.type}
2020
+ ---
2021
+
2022
+ ${body}` : body;
2023
+ await fs.writeFile(`${dir}/${slug}.md`, content.endsWith("\n") ? content : content + "\n");
1998
2024
  const indexPath = `${dir}/MEMORY.md`;
1999
2025
  const idx = await fs.exists(indexPath) ? await fs.readFile(indexPath) : "# Memory Index\n";
2000
- const line = `- [${slug}](${slug}.md) \u2014 ${body.split("\n")[0].slice(0, 80)}`;
2026
+ const summary = opts?.description || body.split("\n")[0].slice(0, 80);
2027
+ const line = `- [${slug}](${slug}.md) \u2014 ${summary}`;
2001
2028
  const lines = idx.split("\n");
2002
2029
  const at = lines.findIndex((l) => l.includes(`(${slug}.md)`));
2003
2030
  if (at >= 0) {
@@ -2010,6 +2037,24 @@ async function writeFact(fs, dir, slug, body) {
2010
2037
  ${line}
2011
2038
  `);
2012
2039
  }
2040
+ await truncateIndex(fs, indexPath);
2041
+ }
2042
+ async function truncateIndex(fs, path) {
2043
+ const raw = await fs.readFile(path);
2044
+ if (raw.length <= MAX_INDEX_BYTES) {
2045
+ const count = raw.split("\n").filter((l) => /^\s*-\s*\[.+\]\(.+\.md\)/.test(l)).length;
2046
+ if (count <= MAX_INDEX_LINES) return;
2047
+ }
2048
+ const lines = raw.split("\n");
2049
+ const pointerIdxs = [];
2050
+ for (let i = 0; i < lines.length; i++) if (/^\s*-\s*\[.+\]\(.+\.md\)/.test(lines[i])) pointerIdxs.push(i);
2051
+ const drop = /* @__PURE__ */ new Set();
2052
+ for (const pi of pointerIdxs) {
2053
+ const candidate = lines.filter((_, i) => !drop.has(i)).join("\n");
2054
+ if (candidate.length <= MAX_INDEX_BYTES && pointerIdxs.length - drop.size <= MAX_INDEX_LINES) break;
2055
+ drop.add(pi);
2056
+ }
2057
+ if (drop.size) await fs.writeFile(path, lines.filter((_, i) => !drop.has(i)).join("\n"));
2013
2058
  }
2014
2059
  function cleanSlug(raw) {
2015
2060
  let s = String(raw ?? "").trim().replace(/\.md$/i, "").replace(/\\/g, "/").replace(/\/+/g, "/").replace(/^\/|\/$/g, "");
@@ -2035,12 +2080,30 @@ async function listSlugs(fs, dir, prefix = "") {
2035
2080
  async function loadFact(fs, dir, slug) {
2036
2081
  const path = `${dir}/${slug}.md`;
2037
2082
  try {
2038
- return await fs.readFile(path);
2083
+ const raw = await fs.readFile(path);
2084
+ const { body } = splitFrontmatter(raw);
2085
+ return body || raw;
2039
2086
  } catch {
2040
2087
  return null;
2041
2088
  }
2042
2089
  }
2043
- function recallTool(fs, dir) {
2090
+ async function loadFactMulti(fs, dirs, slug) {
2091
+ for (const d of dirs) {
2092
+ const r = await loadFact(fs, d, slug);
2093
+ if (r != null) return r;
2094
+ }
2095
+ return null;
2096
+ }
2097
+ async function listSlugsMulti(fs, dirs) {
2098
+ const seen = /* @__PURE__ */ new Set();
2099
+ const out = [];
2100
+ for (const d of dirs) for (const s of await listSlugs(fs, d)) if (!seen.has(s)) {
2101
+ seen.add(s);
2102
+ out.push(s);
2103
+ }
2104
+ return out;
2105
+ }
2106
+ function recallTool(fs, dirs) {
2044
2107
  return {
2045
2108
  name: "Recall",
2046
2109
  description: 'Load memory facts by slug. Pass `slug` for one, `slugs` for several, or `pattern` (glob like "auth*") to match. Returns full bodies, separated by `--- slug ---` headers.',
@@ -2059,12 +2122,12 @@ function recallTool(fs, dir) {
2059
2122
  else if (pattern) {
2060
2123
  const escaped = String(pattern).replace(/[.+^${}()|[\]\\]/g, "\\$&");
2061
2124
  const re = new RegExp("^" + escaped.replace(/\*/g, ".*").replace(/\?/g, ".") + "$", "i");
2062
- targets = (await listSlugs(ctx.fs, dir)).filter((s) => re.test(s));
2125
+ targets = (await listSlugsMulti(fs, dirs)).filter((s) => re.test(s));
2063
2126
  }
2064
2127
  if (!targets.length) return `Error: no slugs resolved. Pass slug, slugs, or pattern.`;
2065
2128
  const parts = [];
2066
2129
  for (const s of targets.slice(0, 20)) {
2067
- const body = await loadFact(ctx.fs, dir, s);
2130
+ const body = await loadFactMulti(fs, dirs, s);
2068
2131
  if (body != null) parts.push(targets.length > 1 ? `--- ${s} ---
2069
2132
  ${body}` : body);
2070
2133
  else parts.push(targets.length > 1 ? `--- ${s} ---
@@ -2075,7 +2138,7 @@ ${body}` : body);
2075
2138
  }
2076
2139
  };
2077
2140
  }
2078
- function memorySearchTool(fs, dir) {
2141
+ function memorySearchTool(fs, dirs) {
2079
2142
  return {
2080
2143
  name: "MemorySearch",
2081
2144
  description: "Search memory facts by content \u2014 find relevant memories when you don't know the exact slug. Returns up to 10 matching slug + snippet pairs, ranked by relevance. Use `regex: true` for regex patterns.",
@@ -2090,7 +2153,7 @@ function memorySearchTool(fs, dir) {
2090
2153
  async run({ query, regex }, ctx) {
2091
2154
  const q = String(query ?? "").trim();
2092
2155
  if (!q) return "Error: empty query.";
2093
- const slugs = await listSlugs(ctx.fs, dir);
2156
+ const slugs = await listSlugsMulti(fs, dirs);
2094
2157
  if (!slugs.length) return "(no memory facts found)";
2095
2158
  let matcher;
2096
2159
  if (regex) {
@@ -2106,7 +2169,7 @@ function memorySearchTool(fs, dir) {
2106
2169
  }
2107
2170
  const loaded = [];
2108
2171
  for (const slug of slugs) {
2109
- const body = await loadFact(ctx.fs, dir, slug);
2172
+ const body = await loadFactMulti(fs, dirs, slug);
2110
2173
  if (body) loaded.push({ slug, body });
2111
2174
  }
2112
2175
  const idf = idfWeights(loaded.map((l) => l.body));
@@ -2125,7 +2188,9 @@ function memorySearchTool(fs, dir) {
2125
2188
  }
2126
2189
  };
2127
2190
  }
2128
- function rememberTool(fs, dir) {
2191
+ function rememberTool(fs, dir, memOpts = {}) {
2192
+ const maxWrites = memOpts.maxWritesPerSession ?? 25;
2193
+ let writes = 0;
2129
2194
  return {
2130
2195
  name: "Remember",
2131
2196
  description: "Persist a durable fact for future sessions (a fix you found, a gotcha, a project constraint). Adds a pointer to the Memory index and stores the body. Use sparingly \u2014 only genuinely reusable knowledge, not task-specific scratch.",
@@ -2134,14 +2199,20 @@ function rememberTool(fs, dir) {
2134
2199
  required: ["fact"],
2135
2200
  properties: {
2136
2201
  fact: { type: "string", description: "the durable fact to remember (one or more lines)" },
2137
- slug: { type: "string", description: "optional kebab-case id; derived from the fact if omitted" }
2202
+ slug: { type: "string", description: "optional kebab-case id; derived from the fact if omitted" },
2203
+ type: { type: "string", enum: ["user", "feedback", "project", "reference"], description: "memory category (user/feedback/project/reference)" },
2204
+ description: { type: "string", description: "one-line summary for the memory index (\u226480 chars)" }
2138
2205
  }
2139
2206
  },
2140
- async run({ fact, slug }, ctx) {
2207
+ async run({ fact, slug, type, description }, ctx) {
2141
2208
  const body = String(fact ?? "").trim();
2142
2209
  if (!body) return `Error: nothing to remember (empty fact).`;
2210
+ if (++writes > maxWrites) return `Rate limit: too many memories this session (${maxWrites}). Only persist genuinely durable facts.`;
2143
2211
  const name = slugify(slug || body.split("\n")[0]);
2144
- await writeFact(ctx.fs, dir, name, body);
2212
+ const isGlobal = (type === "user" || type === "feedback") && memOpts.userDir;
2213
+ const targetDir = isGlobal ? memOpts.userDir : dir;
2214
+ await writeFact(fs, targetDir, name, body, { type, description });
2215
+ memOpts.onMemorySaved?.(name, type);
2145
2216
  return `Remembered '${name}' (recallable in future sessions).`;
2146
2217
  }
2147
2218
  };
@@ -2575,8 +2646,11 @@ var AgentOptions = class {
2575
2646
  skillsDir;
2576
2647
  /** VFS dir(s) of slash-command templates (`<dir>/<name>.md`). If set: inject a catalog + add the `SlashCommand` tool. Multiple dirs are merged (first wins). */
2577
2648
  commandsDir;
2578
- /** VFS dir of memory (`<dir>/MEMORY.md`). If set: inject the index at run start (persistence = backend). */
2649
+ /** VFS dir(s) of memory (`<dir>/MEMORY.md`). If set: inject the index at run start (persistence = backend).
2650
+ * Multiple dirs are merged (reads search all; writes go to first). */
2579
2651
  memoryDir;
2652
+ /** User-scope memory dir for global facts (type=user/feedback). Remember routes by type when set. */
2653
+ memoryUserDir;
2580
2654
  /** Filenames to discover as project instructions (e.g. `AGENT.md`, `AGENTS.md`, `CLAUDE.md`).
2581
2655
  * Walks the VFS tree and merges all found files (general → specific, like Claude Code).
2582
2656
  * `true` (default) = auto-discover standard names. `string[]` = custom names. `false` = skip. */
@@ -2613,6 +2687,8 @@ var AgentOptions = class {
2613
2687
  autoTest;
2614
2688
  /** Provider-specific options forwarded to ai.chat() (e.g. cursor mcpServers, cwd). */
2615
2689
  providerOptions;
2690
+ /** Tool selection mode: 'auto' = model decides (needed for Groq); undefined = provider default. */
2691
+ toolChoice;
2616
2692
  /** Extended-thinking / reasoning effort, normalized across providers (anthropic, openai).
2617
2693
  * `'off'`/undefined = none; `'low'|'medium'|'high'` or a raw token budget. Mapped to the
2618
2694
  * provider-specific request shape via {@link reasoningToChatFragment}; explicit `providerOptions` wins. */
@@ -2699,7 +2775,7 @@ var Agent = class _Agent {
2699
2775
  if (ins) systemPrompt += "\n\n" + ins;
2700
2776
  }
2701
2777
  if (o.memoryDir) {
2702
- const { index, tools: memTools } = await loadMemory(fs, o.memoryDir, { relevanceHint: taskHint });
2778
+ const { index, tools: memTools } = await loadMemory(fs, o.memoryDir, { relevanceHint: taskHint, userDir: o.memoryUserDir });
2703
2779
  if (index) systemPrompt += "\n\n" + index;
2704
2780
  tools = [...tools, ...memTools];
2705
2781
  }
@@ -2826,10 +2902,10 @@ var Agent = class _Agent {
2826
2902
  };
2827
2903
  try {
2828
2904
  if (useStream) {
2829
- const r = await o.ai.chat({ model: o.model, messages: sent, tools: wireTools, stream: true, signal: o.signal, ...reasonOpts });
2905
+ const r = await o.ai.chat({ model: o.model, messages: sent, tools: wireTools, stream: true, signal: o.signal, ...o.toolChoice ? { toolChoice: o.toolChoice } : {}, ...reasonOpts });
2830
2906
  res = await this.consumeStream(r);
2831
2907
  } else {
2832
- const r = await o.ai.chat({ model: o.model, messages: sent, tools: wireTools, stream: false, signal: o.signal, ...reasonOpts });
2908
+ const r = await o.ai.chat({ model: o.model, messages: sent, tools: wireTools, stream: false, signal: o.signal, ...o.toolChoice ? { toolChoice: o.toolChoice } : {}, ...reasonOpts });
2833
2909
  res = r;
2834
2910
  }
2835
2911
  } catch (err) {
@@ -3554,8 +3630,13 @@ var DuplexAgentOptions = class {
3554
3630
  /** Host overrides for QuickLook lookups (keyed by `what`). The engine's defaults go through the
3555
3631
  * (possibly jailed) fs — e.g. `.git/**` is deny-listed, so the CLI supplies 'branch' itself. */
3556
3632
  quickLook;
3633
+ /** Memory directory/directories on the WORKER fs. If set, the voice agent gets Remember + Recall
3634
+ * tools directly (no delegation needed) and implicit capture guidance. */
3635
+ memoryDir;
3636
+ /** User-scope memory dir for global facts (type=user/feedback). Forwarded to Remember's routing. */
3637
+ memoryUserDir;
3557
3638
  };
3558
- var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. 
NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\nNEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3639
+ var VOICE_SYSTEM_PROMPT = 'You are a spoken voice assistant \u2014 the user HEARS everything you say. Use short sentences. One idea per sentence. No markdown, no bullet lists, no code blocks, no headings, no emoji.\nKeep turns SHORT \u2014 one to three sentences, then stop. Never lecture, enumerate cases, or add caveats unprompted. Conversation is a fast exchange: give the one thing asked, and let the user pull more if they want it.\nYou work in a pair: you talk, and a background worker with FULL access to the user\'s environment (files, shell, web) does the hands-on work. You can find out or do ANYTHING by calling `Delegate` with a clear, self-contained brief \u2014 so NEVER tell the user you can\'t see, access, or do something. Delegate and find out. When the user mentions their project, folder, files, or environment ("this project", "the current folder", "my code"), delegate IMMEDIATELY \u2014 do not ask for paths or details the worker can discover itself. Never pretend to have done the work or invent results \u2014 the worker\'s report is your only source.\nAfter calling Delegate, tell the user you are on it in one short sentence, then end your turn. Do not wait for the result.\nResults arrive later as events like "[task t1 completed] \u2026" or "[task t1 failed] \u2026". When one arrives, summarize it for the ear in one or two short sentences. "[task t1 progress] \u2026" events are interim status, NOT results \u2014 give at most a half-sentence aside ("still on it \u2014 running tests now") and end your turn. Never present progress as a finished result.\nNever read raw file paths, diffs, or code aloud verbatim.\n"[task t1 asks] \u2026" events are QUESTIONS from a background task \u2014 relay to the user in your own words, short, then end your turn. When the user answers, call `AnswerTask` with that id and their answer. 
NEVER answer on the user\'s behalf for permissions or risky operations; if their reply is ambiguous, confirm first.\nDo not fire a second Delegate for work already in flight \u2014 check `TaskStatus` first. Use `CancelTask` when the user asks to stop something.\nPRIORITY: when the user says goodbye or wants to end/finish/wrap up the session ("ok bye", "that\'s all", "let\'s finish", "let\'s end", "goodnight", "exit", "wrap up"), call `ExitSession` IMMEDIATELY \u2014 do not delegate, do not check status, just exit.\nFor TRIVIAL instant lookups only \u2014 current time, git branch, listing a folder, peeking at a small file \u2014 use `QuickLook` (instant, no task). Anything requiring searching, reasoning, running commands, or editing still goes through `Delegate`.\n{{MEMORY_SLOT}}\nUser messages may arrive via speech-to-text and can carry transcription artifacts \u2014 odd words, cut-offs, homophones ("for you" vs "folder"). Read for INTENT, not surface text. If a message seems garbled or surprising, briefly confirm what they meant ("did you mean\u2026?") instead of answering the literal words.';
3559
3640
  var VOICE_STYLE_CONVERSATIONAL = `Speak like a person in a live conversation, not an assistant reading a script. React first, then deliver: a quick impulsive beat ("oh nice", "hmm, hold on", "ah, got it") before the substance. Use contractions always. Vary sentence length \u2014 some very short. Light fillers and backchannels are fine ("mm-hm", "right", "let's see") but at most one per reply \u2014 never stack them. When you delegate, say it like a human would ("hang on, let me actually dig into that \u2014 gimme a minute") instead of announcing a task. When a result comes back, react to it like you just found out ("okay so \u2014 turns out\u2026"). Match the user's energy: a quick question gets a quick answer \u2014 a few words is a perfectly good turn. Prefer a short answer plus an offer ("want the details?") over covering everything. Never narrate your own mechanics (no "I will now delegate", no task ids out loud).`;
3560
3641
  var DuplexAgent = class {
3561
3642
  options;
@@ -3567,34 +3648,45 @@ var DuplexAgent = class {
3567
3648
  flushQueued = false;
3568
3649
  /** Parked worker questions awaiting a (voice-relayed) user answer, keyed by ask id. */
3569
3650
  pendingAsks = /* @__PURE__ */ new Map();
3651
+ /** Lazily resolved memory tools (async loadMemory runs in initMemory). */
3652
+ memoryReady;
3570
3653
  constructor(options) {
3571
3654
  this.options = { ...new DuplexAgentOptions(), ...options };
3572
3655
  const o = this.options;
3656
+ if (o.memoryDir && o.fs) {
3657
+ this.memoryReady = loadMemory(o.fs, o.memoryDir, { maxWritesPerSession: 10, userDir: o.memoryUserDir });
3658
+ }
3659
+ const memSlot = o.memoryDir && o.fs ? VOICE_MEMORY_PROMPT : "NEVER claim to have stored, saved, or remembered something durably \u2014 you cannot. Anything the user wants persisted (their name, preferences, notes) must be Delegated so a worker writes it to memory.";
3660
+ const prompt = VOICE_SYSTEM_PROMPT.replace("{{MEMORY_SLOT}}", memSlot) + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
3661
+ Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`;
3573
3662
  this.voice = new Agent({
3574
3663
  ai: o.ai,
3575
3664
  fs: new MemFilesystem2(),
3576
- // scratch — NOT Agent's jailed-disk default (voice has no fs tools; edge-safe)
3577
3665
  model: o.voiceModel,
3578
3666
  stream: true,
3579
3667
  host: o.host,
3580
- // Runtime context line: without it the voice confidently invents "facts" like today's date
3581
- // (its training cutoff) instead of delegating or admitting it doesn't know.
3582
- systemPrompt: VOICE_SYSTEM_PROMPT + (o.voiceStyle === "conversational" ? "\n" + VOICE_STYLE_CONVERSATIONAL : "") + `
3583
- Today's date: ${(/* @__PURE__ */ new Date()).toDateString()}.`,
3668
+ systemPrompt: prompt,
3584
3669
  instructionFiles: false,
3585
3670
  maxSteps: 8,
3586
- // a voice turn should never loop
3587
3671
  timeoutMs: 3e4,
3588
3672
  ...o.voiceOptions,
3589
- // no defaultTools() — the voice can only Delegate, never touch files itself. Set AFTER the
3590
- // voiceOptions spread (addTools() would be clobbered by the first prepare()); extra voice
3591
- // tools come in via voiceOptions.tools and are merged here.
3592
3673
  tools: [...o.voiceOptions?.tools ?? [], this.delegateTool(), this.taskStatusTool(), this.cancelTaskTool(), this.quickLookTool(), this.answerTaskTool()]
3593
3674
  });
3594
3675
  }
3676
+ /** Resolve memory tools + inject index into voice system prompt (once). */
3677
+ async initMemory() {
3678
+ if (!this.memoryReady) return;
3679
+ const mem = await this.memoryReady;
3680
+ this.memoryReady = void 0;
3681
+ this.voice.options.tools.push(...mem.tools);
3682
+ if (mem.index) this.voice.options.systemPrompt += "\n\n" + mem.index;
3683
+ }
3595
3684
  /** One user turn: the voice agent streams the reply (and may Delegate). Serialized with re-voice turns. */
3596
3685
  send(content) {
3597
- return this.enqueue(() => this.voice.send(content));
3686
+ return this.enqueue(async () => {
3687
+ await this.initMemory();
3688
+ return this.voice.send(content);
3689
+ });
3598
3690
  }
3599
3691
  /** Resolve when all queued voice turns AND all in-flight worker tasks have settled (tests, graceful shutdown). */
3600
3692
  async idle() {
@@ -4062,18 +4154,15 @@ var VoiceEngineOptions = class {
4062
4154
  /** heuristic (non-AEC) energy barge-in tuning */
4063
4155
  bargeRmsMult = 2;
4064
4156
  bargeRmsFloor = 500;
4065
- /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model:
4066
- * onset PAUSE (exact-sample hold, nothing lost); sustained overlap cede (interrupt; the LLM
4067
- * re-enters). Brief overlaps that die out (backchannels"mm-hm", decided by DURATION, not
4068
- * vocabulary) resume from the precise sample and are dropped. false disables. */
4157
+ /** Overlap turn-taking (AEC tier, needs player.pause/resume) — human phone-call model, driven by
4158
+ * the STT ITSELF (a trained speech classifier) instead of energy thresholds (energy could not
4159
+ * separate residue bursts from speech in every room — hiccup whack-a-mole): partial text while
4160
+ * speaking → PAUSE (exact-sample hold); partial grows into dominant-novel ≥2 words → cede
4161
+ * (interrupt; the LLM re-enters); partial stalls/endpoints without ceding (backchannel by
4162
+ * DURATION, not vocabulary) → resume + drop. false disables. */
4069
4163
  overlapPause = true;
4070
- /** sustained overlap this → cede the turn */
4071
- overlapSustainMs = 450;
4072
- /** quiet for this long while paused → resume, drop the interjection */
4164
+ /** no new partial activity for this long while paused → resume, drop the interjection */
4073
4165
  overlapResumeMs = 700;
4074
- /** energy floor for "overlap candidate" — must sit ABOVE typical room ambient (~110 rms measured;
4075
- * ungated ambient re-arming the resume timer forever was a live wedge). User speech ≫ 300. */
4076
- overlapRms = 300;
4077
4166
  };
4078
4167
  var VoiceEngine = class {
4079
4168
  options;
@@ -4106,12 +4195,8 @@ var VoiceEngine = class {
4106
4195
  lastInterrupted = null;
4107
4196
  // overlap (pause) tier state — AEC + pause-capable sinks only
4108
4197
  pausedAt = 0;
4109
- overlapLoud = 0;
4110
- // loud chunks since pause (sustain must be real sound, not two clicks)
4111
- overlapLastLoudAt = 0;
4112
- // continuity guard: a gap re-arms the onset (sparse noise ≠ sustained speech)
4113
- loudTimes = [];
4114
- // recent loud-chunk timestamps (sliding onset window)
4198
+ lastOverlapPartial = "";
4199
+ // change-detection: only NEW partial text counts as activity
4115
4200
  resumeTimer = null;
4116
4201
  constructor(options) {
4117
4202
  this.options = { ...new VoiceEngineOptions(), ...options };
@@ -4275,7 +4360,24 @@ var VoiceEngine = class {
4275
4360
  }
4276
4361
  handlePartial(text) {
4277
4362
  if (this.speaking) {
4278
- const barge = this.overlapCapable ? false : this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
4363
+ if (this.overlapCapable) {
4364
+ const txt = text.trim();
4365
+ if (!txt || txt === this.lastOverlapPartial) return;
4366
+ this.lastOverlapPartial = txt;
4367
+ if (!this.pausedAt) {
4368
+ this.pausedAt = now();
4369
+ this.player.pause();
4370
+ }
4371
+ if (this.genuine(txt) && this.words(txt).length >= 2) {
4372
+ const phase = this.ctxOpen ? "speaking" : "drain";
4373
+ this.interrupt();
4374
+ this.options.onBargeIn(phase);
4375
+ return;
4376
+ }
4377
+ this.armResume();
4378
+ return;
4379
+ }
4380
+ const barge = this.usingAec ? this.genuine(text) : this.novelWords(text).length >= (this.suspectUntil ? 1 : 2);
4279
4381
  if (barge) {
4280
4382
  const phase = this.ctxOpen ? "speaking" : "drain";
4281
4383
  this.interrupt();
@@ -4284,15 +4386,13 @@ var VoiceEngine = class {
4284
4386
  return;
4285
4387
  }
4286
4388
  if (this.pendingUtt && text.trim()) {
4287
- if (this.pendingTimer) {
4288
- clearTimeout(this.pendingTimer);
4289
- this.pendingTimer = null;
4290
- }
4389
+ if (this.pendingTimer) clearTimeout(this.pendingTimer);
4390
+ this.pendingTimer = setTimeout(() => this.flushUtterance(), Math.max(800, this.options.utteranceMergeMs));
4291
4391
  }
4292
4392
  if (!this.echoActive() || (this.usingAec ? this.genuine(text) : this.novelWords(text).length >= 1)) this.options.onPartial(text);
4293
4393
  }
4294
4394
  handleUtterance(text) {
4295
- if (this.speaking && this.ctxOpen && this.overlapCapable) {
4395
+ if (this.speaking && (this.ctxOpen || this.pausedAt) && this.overlapCapable) {
4296
4396
  this.stt.reset();
4297
4397
  return;
4298
4398
  }
@@ -4307,7 +4407,7 @@ var VoiceEngine = class {
4307
4407
  }
4308
4408
  this.pendingUtt = this.pendingUtt ? `${this.pendingUtt} ${text}` : text;
4309
4409
  if (this.pendingTimer) clearTimeout(this.pendingTimer);
4310
- if (!this.options.utteranceMergeMs) return this.flushUtterance();
4410
+ if (!this.options.utteranceMergeMs || this.words(this.pendingUtt).length >= 4) return this.flushUtterance();
4311
4411
  this.pendingTimer = setTimeout(() => this.flushUtterance(), this.options.utteranceMergeMs);
4312
4412
  }
4313
4413
  flushUtterance() {
@@ -4322,48 +4422,12 @@ var VoiceEngine = class {
4322
4422
  get overlapCapable() {
4323
4423
  return this.usingAec && this.options.overlapPause && !!this.player.pause && !!this.player.resume;
4324
4424
  }
4325
- /** Overlap turn-taking (AEC tier): onset → pause (exact-sample hold); sustained → cede; died out
4326
- * → resume. No vocabulary anywhere — duration and persistence decide (backchannels are short
4327
- * and stop). Nothing is lost across a pause, so a false positive costs only a brief hold. */
4328
- handleOverlap(rms) {
4329
- const o = this.options;
4330
- if (!this.speaking || !this.overlapCapable) return;
4331
- if (rms < o.overlapRms) return;
4332
- const t = now();
4333
- if (!this.pausedAt) {
4334
- this.loudTimes = this.loudTimes.filter((x) => t - x < 400);
4335
- this.loudTimes.push(t);
4336
- if (this.loudTimes.length < 2) return;
4337
- this.loudTimes = [];
4338
- this.pausedAt = t;
4339
- this.overlapLoud = 2;
4340
- this.overlapLastLoudAt = t;
4341
- this.player.pause();
4342
- this.armResume();
4343
- return;
4344
- }
4345
- if (t - this.overlapLastLoudAt > 450) {
4346
- this.pausedAt = t;
4347
- this.overlapLoud = 1;
4348
- this.overlapLastLoudAt = t;
4349
- this.armResume();
4350
- return;
4351
- }
4352
- this.overlapLastLoudAt = t;
4353
- this.overlapLoud++;
4354
- if (t - this.pausedAt >= o.overlapSustainMs && this.overlapLoud >= 3) {
4355
- const phase = this.ctxOpen ? "speaking" : "drain";
4356
- this.interrupt();
4357
- this.options.onBargeIn(phase);
4358
- return;
4359
- }
4360
- this.armResume();
4361
- }
4362
4425
  armResume() {
4363
4426
  if (this.resumeTimer) clearTimeout(this.resumeTimer);
4364
4427
  this.resumeTimer = setTimeout(() => {
4365
4428
  this.resumeTimer = null;
4366
4429
  if (!this.pausedAt) return;
4430
+ this.stt.reset();
4367
4431
  this.resetOverlap(true);
4368
4432
  }, this.options.overlapResumeMs);
4369
4433
  }
@@ -4374,12 +4438,25 @@ var VoiceEngine = class {
4374
4438
  }
4375
4439
  if (this.pausedAt && resume) this.player.resume?.();
4376
4440
  this.pausedAt = 0;
4377
- this.overlapLoud = 0;
4378
- this.loudTimes = [];
4441
+ this.lastOverlapPartial = "";
4442
+ this.gatePassTimes = [];
4379
4443
  }
4380
4444
  /** energy two-stage barge-in (heuristic tier only): spike over echo baseline → pause + confirm via STT */
4445
+ gatePassTimes = [];
4446
+ // recent gate-PASSING chunks (helper zeroes residue — nonzero = vetted)
4381
4447
  handleLevel(rms) {
4382
- if (this.usingAec) return this.handleOverlap(rms);
4448
+ if (this.usingAec) {
4449
+ if (!this.speaking || !this.overlapCapable || this.pausedAt || rms < 50) return;
4450
+ const t = now();
4451
+ this.gatePassTimes = this.gatePassTimes.filter((x) => t - x < 350);
4452
+ this.gatePassTimes.push(t);
4453
+ if (this.gatePassTimes.length < 2) return;
4454
+ this.gatePassTimes = [];
4455
+ this.pausedAt = t;
4456
+ this.player.pause();
4457
+ this.armResume();
4458
+ return;
4459
+ }
4383
4460
  if (!this.speaking) {
4384
4461
  this.baseline = 0;
4385
4462
  this.hot = 0;
@@ -4419,6 +4496,9 @@ var SonioxSTTOptions = class {
4419
4496
  source;
4420
4497
  model = "stt-rt-preview";
4421
4498
  languageHints = ["en"];
4499
+ /** Client-side endpoint: finalized text + no new tokens for this long = utterance (don't wait for
4500
+ * Soniox's semantic <end>, which adds 0.5-1.5s — the difference between ping-pong and lag). */
4501
+ silenceEndpointMs = 500;
4422
4502
  };
4423
4503
  var SonioxSTT = class {
4424
4504
  options;
@@ -4434,6 +4514,9 @@ var SonioxSTT = class {
4434
4514
  };
4435
4515
  finalText = "";
4436
4516
  partialText = "";
4517
+ lastChangeAt = 0;
4518
+ lastCombined = "";
4519
+ endpointTimer = null;
4437
4520
  constructor(options) {
4438
4521
  this.options = { ...new SonioxSTTOptions(), ...options };
4439
4522
  }
@@ -4470,6 +4553,13 @@ var SonioxSTT = class {
4470
4553
  await this.connectWs();
4471
4554
  if (this.sourceStarted) return;
4472
4555
  this.sourceStarted = true;
4556
+ this.endpointTimer = setInterval(() => {
4557
+ const combined = (this.finalText + this.partialText).trim();
4558
+ if (!combined || now2() - this.lastChangeAt < this.options.silenceEndpointMs) return;
4559
+ this.reset();
4560
+ this.onUtterance(combined, now2());
4561
+ }, 120);
4562
+ this.endpointTimer.unref?.();
4473
4563
  await this.options.source.start((chunk) => {
4474
4564
  let sum = 0;
4475
4565
  const view = new DataView(chunk.buffer, chunk.byteOffset, chunk.byteLength);
@@ -4489,7 +4579,12 @@ var SonioxSTT = class {
4489
4579
  else if (t.is_final) this.finalText += t.text;
4490
4580
  }
4491
4581
  this.partialText = (m.tokens ?? []).filter((t) => !t.is_final && t.text !== "<end>").map((t) => t.text).join("");
4492
- this.onPartial(this.finalText + this.partialText);
4582
+ const combined = this.finalText + this.partialText;
4583
+ if (combined !== this.lastCombined) {
4584
+ this.lastCombined = combined;
4585
+ this.lastChangeAt = now2();
4586
+ }
4587
+ this.onPartial(combined);
4493
4588
  if (endpoint && this.finalText.trim()) {
4494
4589
  const utterance = this.finalText.trim();
4495
4590
  this.reset();
@@ -4499,9 +4594,11 @@ var SonioxSTT = class {
4499
4594
  reset() {
4500
4595
  this.finalText = "";
4501
4596
  this.partialText = "";
4597
+ this.lastCombined = "";
4502
4598
  }
4503
4599
  stop() {
4504
4600
  this.stopped = true;
4601
+ if (this.endpointTimer) clearInterval(this.endpointTimer);
4505
4602
  this.options.source?.stop();
4506
4603
  if (this.ws) this.ws.onclose = null;
4507
4604
  this.ws?.close();
@@ -4532,7 +4629,15 @@ var CartesiaTTS = class {
4532
4629
  constructor(options) {
4533
4630
  this.options = { ...new CartesiaTTSOptions(), ...options };
4534
4631
  }
4632
+ closed = false;
4633
+ connecting = null;
4535
4634
  async connect() {
4635
+ this.closed = false;
4636
+ this.connecting = this.doConnect();
4637
+ await this.connecting;
4638
+ this.connecting = null;
4639
+ }
4640
+ async doConnect() {
4536
4641
  const key = await resolveAuth(this.options.auth);
4537
4642
  const param = this.options.authMode === "token" ? "access_token" : "api_key";
4538
4643
  this.ws = new WebSocket(`wss://api.cartesia.ai/tts/websocket?cartesia_version=2026-03-01&${param}=${key}`);
@@ -4540,7 +4645,12 @@ var CartesiaTTS = class {
4540
4645
  this.ws.onopen = () => res();
4541
4646
  this.ws.onerror = (e) => rej(new Error(`cartesia ws: ${e.message || "connect failed"}`));
4542
4647
  });
4543
- this.ws.onclose = (ev) => log10.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
4648
+ this.ws.onclose = (ev) => {
4649
+ log10.warn(`cartesia ws closed (${ev.code} ${ev.reason || ""})`);
4650
+ if (!this.closed) {
4651
+ this.connecting = this.doConnect().catch((e) => log10.error(`cartesia reconnect failed: ${e.message}`));
4652
+ }
4653
+ };
4544
4654
  this.ws.onmessage = (ev) => {
4545
4655
  const m = JSON.parse(String(ev.data));
4546
4656
  if (m.context_id && m.context_id !== this.ctxId) return;
@@ -4551,6 +4661,11 @@ var CartesiaTTS = class {
4551
4661
  else if (m.type === "error" && !/already been cancelled|does not exist/.test(m.message || "")) log10.warn(`cartesia: ${JSON.stringify(m)}`);
4552
4662
  };
4553
4663
  }
4664
+ /** Ensure the WS is open before sending — reconnects if idle-closed. */
4665
+ async ensureConnected() {
4666
+ if (this.connecting) await this.connecting;
4667
+ if (this.ws?.readyState !== WebSocket.OPEN) await this.connect();
4668
+ }
4554
4669
  newContext() {
4555
4670
  this.ctxId = `ctx-${++this.ctxSeq}`;
4556
4671
  this.firstAudioAt = 0;
@@ -4568,6 +4683,7 @@ var CartesiaTTS = class {
4568
4683
  }
4569
4684
  speak(text, cont) {
4570
4685
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame(text, cont));
4686
+ else void this.ensureConnected().then(() => this.ws?.readyState === WebSocket.OPEN && this.ws.send(this.frame(text, cont)));
4571
4687
  }
4572
4688
  end() {
4573
4689
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(this.frame("", false));
@@ -4576,6 +4692,7 @@ var CartesiaTTS = class {
4576
4692
  if (this.ws?.readyState === WebSocket.OPEN) this.ws.send(JSON.stringify({ context_id: this.ctxId, cancel: true }));
4577
4693
  }
4578
4694
  close() {
4695
+ this.closed = true;
4579
4696
  if (this.ws) this.ws.onclose = null;
4580
4697
  this.ws?.close();
4581
4698
  }
@@ -4607,6 +4724,7 @@ export {
4607
4724
  JailOptions,
4608
4725
  JailedFilesystem,
4609
4726
  LessonOptionsDefaults,
4727
+ MEMORY_PROMPT,
4610
4728
  MemFilesystem3 as MemFilesystem,
4611
4729
  MountFilesystem,
4612
4730
  NodeDiskFilesystem,
@@ -4621,6 +4739,7 @@ export {
4621
4739
  SonioxSTT,
4622
4740
  SonioxSTTOptions,
4623
4741
  TTS_SAMPLE_RATE,
4742
+ VOICE_MEMORY_PROMPT,
4624
4743
  VOICE_SYSTEM_PROMPT,
4625
4744
  VoiceEngine,
4626
4745
  VoiceEngineOptions,