npm - reasonix - Versions diffs - 0.47.0 → 0.47.1 - Mend

reasonix 0.47.0 → 0.47.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108) hide show

package/dist/index.js CHANGED Viewed

@@ -579,6 +579,7 @@ function webSearchEngine(path2 = defaultConfigPath()) {
   const cfg = readConfig(path2).webSearchEngine;
   if (cfg === "searxng") return "searxng";
   if (cfg === "metaso") return "metaso";
+  if (cfg === "tavily") return "tavily";
   return "mojeek";
 }
 function webSearchEndpoint(path2 = defaultConfigPath()) {
@@ -805,7 +806,7 @@ var DeepSeekClient = class {
     if (opts.temperature !== void 0) payload.temperature = opts.temperature;
     if (opts.maxTokens !== void 0) payload.max_tokens = opts.maxTokens;
     if (opts.responseFormat) payload.response_format = opts.responseFormat;
-    if (opts.thinking) {
+    if (opts.thinking && !this._isAzureEndpoint()) {
       payload.extra_body = { thinking: { type: opts.thinking } };
     }
     if (opts.reasoningEffort) {
@@ -813,6 +814,17 @@ var DeepSeekClient = class {
     }
     return payload;
   }
+  /** Azure OpenAI-compatible endpoints do not accept DeepSeek's proprietary
+   *  `extra_body.thinking` field (they reject the request with 400).  We still
+   *  send `reasoning_effort`, which Azure *does* support. */
+  _isAzureEndpoint() {
+    try {
+      const host = new URL(this.baseUrl).hostname;
+      return host === "azure.com" || host.endsWith(".azure.com");
+    } catch {
+      return false;
+    }
+  }
   /** Returns null on failure so callers can degrade — session must keep working without balance UI. */
   async getBalance(opts = {}) {
     try {
@@ -1543,6 +1555,7 @@ var EN = {
     reviewSaveError: "Could not save config: {message}",
     reviewFooter: "[Enter] save \xB7 [Esc] cancel",
     savedTitle: "\u25B8 Saved.",
+    savedShellHint: "Shell commands the model wants to run ask each time \u2014 pick `allow always` on the prompt to whitelist that exact command for this project. No global allow-all flag by design.",
     savedFooter: "[Enter] to exit",
     selectFooter: "[\u2191\u2193] navigate \xB7 [Enter] confirm \xB7 [Esc] cancel",
     stepCounter: "Step {step}/{total} \xB7 ",
@@ -1607,6 +1620,8 @@ var EN = {
       title: "Checkpoint \u2014 step done",
       continue: "Continue \u2014 run the next step",
       continueHint: "Model resumes with the next step.",
+      finish: "Finish \u2014 summarize and close",
+      finishHint: "Model records the final step and summarizes the completed plan.",
       revise: "Revise \u2014 give feedback before the next step",
       reviseHint: "Stay paused, type guidance; model adjusts the remaining plan.",
       stop: "Stop \u2014 end the plan here",
@@ -1747,6 +1762,10 @@ var EN = {
       helpShellDetail: "                             the conversation so the model sees it next turn.",
       helpShellConsent: "                             No allowlist gate \u2014 user-typed = explicit consent.",
       helpShellExample: "                             Example: !git status   !ls src/   !npm test",
+      helpShellGateTitle: "Model-invoked shell commands (per-call approval):",
+      helpShellGate: "  \u2191\u2193 + \u23CE                   each call shows a prompt with `allow once` / `allow always`",
+      helpShellGateDetail: "                             / `deny`. Pick `allow always` to whitelist that exact",
+      helpShellGatePolicy: "                             command prefix for this project. No global allow-all flag.",
       helpMemoryTitle: "Quick memory:",
       helpMemoryPin: "  #<note>                  append <note> to <project>/REASONIX.md (committable).",
       helpMemoryPinEx: "                             Example: #findByEmail must be case-insensitive",
@@ -2058,7 +2077,7 @@ var EN = {
       changesNoteShort: "Changes take effect on next /new or launch. Subcommands: /memory list | show | forget | clear"
     },
     mcp: {
-      noServers: 'no MCP servers attached. Run `reasonix setup` to pick some, or launch with --mcp "<spec>". `reasonix mcp list` shows the catalog.',
+      noServers: 'no MCP servers attached. Run `reasonix setup` to pick some, or launch with --mcp "<spec>". `reasonix mcp list` shows the catalog. Note: model-invoked shell commands are gated per-call (allow once / allow always / deny) \u2014 no global allow-all flag.',
       toolsLabel: "  tools     {count}",
       resourcesHint: "`/resource` to browse+read",
       promptsHint: "`/prompt` to browse+fetch",
@@ -2489,7 +2508,7 @@ var EN = {
     slow: "slow \xB7 {ms}ms",
     verySlow: "very slow \xB7 {ms}ms",
     slowToast: "\u26A0 MCP `{name}` slow \xB7 {seconds}s p95 over the last {sampleSize} calls",
-    emptyHint: "\u2139 no MCP servers configured \u2014 try: `reasonix setup` to re-pick, or `reasonix mcp install filesystem`"
+    emptyHint: "\u2139 no MCP servers configured \u2014 try: `reasonix setup` to re-pick, or `reasonix mcp install filesystem` \xB7 shell commands gate per-call (allow once / allow always / deny), no global allow-all"
   },
   denyContextInput: {
     description: "Tell the agent why you denied this. The next attempt will see your reason as additional context."
@@ -3050,6 +3069,7 @@ var zhCN = {
     reviewSaveError: "\u4FDD\u5B58\u914D\u7F6E\u5931\u8D25\uFF1A{message}",
     reviewFooter: "[Enter] \u4FDD\u5B58 \xB7 [Esc] \u53D6\u6D88",
     savedTitle: "\u25B8 \u5DF2\u4FDD\u5B58\u3002",
+    savedShellHint: "\u6A21\u578B\u53D1\u8D77\u7684 shell \u547D\u4EE4\u6BCF\u6B21\u90FD\u4F1A\u5F39\u51FA\u786E\u8BA4 \u2014\u2014 \u5728\u63D0\u793A\u6846\u91CC\u9009 `allow always` \u53EF\u5C06\u8BE5\u547D\u4EE4\u524D\u7F00\u52A0\u5165\u672C\u9879\u76EE\u767D\u540D\u5355\u3002\u8BBE\u8BA1\u4E0A\u6CA1\u6709\u300C\u5168\u5C40\u653E\u884C\u300D\u5F00\u5173\u3002",
     savedFooter: "[Enter] \u9000\u51FA",
     selectFooter: "[\u2191\u2193] \u79FB\u52A8 \xB7 [Enter] \u786E\u8BA4 \xB7 [Esc] \u53D6\u6D88",
     stepCounter: "\u6B65\u9AA4 {step}/{total} \xB7 ",
@@ -3114,6 +3134,8 @@ var zhCN = {
       title: "\u68C0\u67E5\u70B9 \u2014\u2014 \u5F53\u524D\u6B65\u9AA4\u5DF2\u5B8C\u6210",
       continue: "\u7EE7\u7EED \u2014\u2014 \u6267\u884C\u4E0B\u4E00\u6B65",
       continueHint: "\u6A21\u578B\u4ECE\u4E0B\u4E00\u6B65\u7EE7\u7EED\u3002",
+      finish: "\u5B8C\u6210 \u2014\u2014 \u603B\u7ED3\u5E76\u6536\u5C3E",
+      finishHint: "\u6A21\u578B\u8BB0\u5F55\u6700\u540E\u4E00\u6B65\uFF0C\u7136\u540E\u603B\u7ED3\u5DF2\u5B8C\u6210\u7684\u8BA1\u5212\u3002",
       revise: "\u8C03\u6574 \u2014\u2014 \u5728\u4E0B\u4E00\u6B65\u524D\u7ED9\u53CD\u9988",
       reviseHint: "\u5148\u6682\u505C\uFF0C\u8F93\u5165\u6307\u5F15\uFF1B\u6A21\u578B\u4F1A\u8C03\u6574\u5269\u4F59\u8BA1\u5212\u3002",
       stop: "\u505C\u6B62 \u2014\u2014 \u5728\u6B64\u7ED3\u675F\u8BA1\u5212",
@@ -3254,6 +3276,10 @@ var zhCN = {
       helpShellDetail: "                             \u4EE5\u4FBF\u6A21\u578B\u5728\u4E0B\u4E00\u8F6E\u770B\u5230\u3002\u65E0\u5141\u8BB8\u5217\u8868\u9650\u5236\u3002",
       helpShellConsent: "                             \u7528\u6237\u8F93\u5165 = \u660E\u786E\u540C\u610F\u3002",
       helpShellExample: "                             \u793A\u4F8B\uFF1A!git status   !ls src/   !npm test",
+      helpShellGateTitle: "\u6A21\u578B\u53D1\u8D77\u7684 shell \u547D\u4EE4\uFF08\u6309\u6B21\u5BA1\u6279\uFF09\uFF1A",
+      helpShellGate: "  \u2191\u2193 + \u23CE                   \u6BCF\u6B21\u90FD\u4F1A\u5F39\u51FA `allow once` / `allow always` /",
+      helpShellGateDetail: "                             `deny` \u4E09\u9009\u4E00\u3002\u9009 `allow always` \u53EF\u5C06\u8BE5\u547D\u4EE4\u524D\u7F00",
+      helpShellGatePolicy: "                             \u52A0\u5165\u672C\u9879\u76EE\u767D\u540D\u5355\u3002\u8BBE\u8BA1\u4E0A\u6CA1\u6709\u300C\u5168\u5C40\u653E\u884C\u300D\u5F00\u5173\u3002",
       helpMemoryTitle: "\u5FEB\u901F\u8BB0\u5FC6\uFF1A",
       helpMemoryPin: "  #<note>                  \u5C06 <note> \u8FFD\u52A0\u5230 <project>/REASONIX.md\uFF08\u53EF\u63D0\u4EA4\uFF09\u3002",
       helpMemoryPinEx: "                             \u793A\u4F8B\uFF1A#findByEmail \u5FC5\u987B\u533A\u5206\u5927\u5C0F\u5199",
@@ -3565,7 +3591,7 @@ var zhCN = {
       changesNoteShort: "\u66F4\u6539\u5728\u4E0B\u6B21 /new \u6216\u542F\u52A8\u65F6\u751F\u6548\u3002\u5B50\u547D\u4EE4\uFF1A/memory list | show | forget | clear"
     },
     mcp: {
-      noServers: '\u672A\u9644\u52A0 MCP \u670D\u52A1\u5668\u3002\u8FD0\u884C `reasonix setup` \u9009\u62E9\u4E00\u4E9B\uFF0C\u6216\u4F7F\u7528 --mcp "<spec>" \u542F\u52A8\u3002`reasonix mcp list` \u663E\u793A\u76EE\u5F55\u3002',
+      noServers: '\u672A\u9644\u52A0 MCP \u670D\u52A1\u5668\u3002\u8FD0\u884C `reasonix setup` \u9009\u62E9\u4E00\u4E9B\uFF0C\u6216\u4F7F\u7528 --mcp "<spec>" \u542F\u52A8\u3002`reasonix mcp list` \u663E\u793A\u76EE\u5F55\u3002\u6CE8\uFF1A\u6A21\u578B\u53D1\u8D77\u7684 shell \u547D\u4EE4\u6309\u6B21\u5BA1\u6279\uFF08allow once / allow always / deny\uFF09\uFF0C\u8BBE\u8BA1\u4E0A\u6CA1\u6709\u300C\u5168\u5C40\u653E\u884C\u300D\u5F00\u5173\u3002',
       toolsLabel: "  \u5DE5\u5177     {count}",
       resourcesHint: "`/resource` \u6D4F\u89C8+\u8BFB\u53D6",
       promptsHint: "`/prompt` \u6D4F\u89C8+\u83B7\u53D6",
@@ -3996,7 +4022,7 @@ var zhCN = {
     slow: "\u7F13\u6162 \xB7 {ms}ms",
     verySlow: "\u975E\u5E38\u6162 \xB7 {ms}ms",
     slowToast: "\u26A0 MCP `{name}` \u54CD\u5E94\u7F13\u6162 \xB7 P95 {seconds}s \xB7 \u6700\u8FD1 {sampleSize} \u6B21\u8C03\u7528",
-    emptyHint: "\u2139 \u672A\u914D\u7F6E MCP \u670D\u52A1\u5668 \u2014\u2014 \u53EF\u5C1D\u8BD5\uFF1A`reasonix setup` \u91CD\u65B0\u9009\u62E9\uFF0C\u6216 `reasonix mcp install filesystem`"
+    emptyHint: "\u2139 \u672A\u914D\u7F6E MCP \u670D\u52A1\u5668 \u2014\u2014 \u53EF\u5C1D\u8BD5\uFF1A`reasonix setup` \u91CD\u65B0\u9009\u62E9\uFF0C\u6216 `reasonix mcp install filesystem` \xB7 shell \u547D\u4EE4\u6309\u6B21\u5BA1\u6279\uFF08allow once / allow always / deny\uFF09\uFF0C\u65E0\u5168\u5C40\u653E\u884C"
   },
   denyContextInput: {
     description: "\u544A\u8BC9\u6A21\u578B\u4F60\u4E3A\u4EC0\u4E48\u62D2\u7EDD\u4E86\u3002\u6A21\u578B\u4E0B\u6B21\u4F1A\u770B\u5230\u4F60\u7684\u7406\u7531\u4F5C\u4E3A\u989D\u5916\u7684\u4E0A\u4E0B\u6587\u3002"
@@ -4795,10 +4821,13 @@ var ToolRegistry = class {
   _autoFlatten;
   _planMode = false;
   _interceptor = null;
+  _interceptors = [];
   _auditListener = null;
   _resultAugmenter = null;
   /** Per-tool fingerprint of the last call that failed schema validation. Cleared by any successful validation for that tool. */
   _lastMalformed = /* @__PURE__ */ new Map();
+  /** Per-tool fingerprint of the last host-side interceptor rejection. */
+  _lastInterceptorRejection = /* @__PURE__ */ new Map();
   constructor(opts = {}) {
     this._autoFlatten = opts.autoFlatten !== false;
   }
@@ -4814,6 +4843,18 @@ var ToolRegistry = class {
   setToolInterceptor(fn) {
     this._interceptor = fn;
   }
+  /** Ordered host-side interceptors. They run before the legacy single interceptor. */
+  addToolInterceptor(id, fn) {
+    const normalized = id.trim();
+    if (!normalized) throw new Error("tool interceptor requires a non-empty id");
+    const existing = this._interceptors.findIndex((entry) => entry.id === normalized);
+    if (existing >= 0) this._interceptors.splice(existing, 1);
+    this._interceptors.push({ id: normalized, fn });
+    return () => {
+      const idx = this._interceptors.findIndex((entry) => entry.id === normalized);
+      if (idx >= 0) this._interceptors.splice(idx, 1);
+    };
+  }
   setAuditListener(fn) {
     this._auditListener = fn;
   }
@@ -4902,16 +4943,21 @@ var ToolRegistry = class {
         rejectedReason: "plan-mode"
       });
     }
-    if (this._interceptor) {
+    const chain = this._interceptor ? [...this._interceptors.map((entry) => entry.fn), this._interceptor] : this._interceptors.map((entry) => entry.fn);
+    for (const interceptor of chain) {
       try {
-        const short = await this._interceptor(name, args);
-        if (typeof short === "string") return short;
+        const short = await interceptor(name, args);
+        if (typeof short === "string") {
+          const guarded = this._noteInterceptorRejection(name, fingerprint, short);
+          return this._augmentResult(name, args, guarded);
+        }
       } catch (err) {
         return JSON.stringify({
           error: `${name}: interceptor failed \u2014 ${err.message}`
         });
       }
     }
+    this._lastInterceptorRejection.delete(name);
     if (opts.signal?.aborted) {
       return JSON.stringify({
         error: `${name}: aborted before dispatch (user interrupt)`,
@@ -4949,13 +4995,16 @@ var ToolRegistry = class {
         finalResult = JSON.stringify({ error: `${e.name}: ${e.message}` });
       }
     }
+    return this._augmentResult(name, args, finalResult);
+  }
+  _augmentResult(name, args, result) {
     if (this._resultAugmenter) {
       try {
-        return this._resultAugmenter(name, args, finalResult);
+        return this._resultAugmenter(name, args, result);
       } catch {
       }
     }
-    return finalResult;
+    return result;
   }
   /** Records the failed call's fingerprint; on the 2nd consecutive identical malformed call to the same tool, returns a sharper error that tells the model to stop retrying. */
   _noteMalformed(name, fingerprint, detail) {
@@ -4969,7 +5018,35 @@ var ToolRegistry = class {
     }
     return JSON.stringify({ error: `${name}: ${detail}` });
   }
+  _noteInterceptorRejection(name, fingerprint, result) {
+    const reason = rejectedReason(result);
+    if (!reason) {
+      this._lastInterceptorRejection.delete(name);
+      return result;
+    }
+    const key = `${reason}:${fingerprint}`;
+    const prev = this._lastInterceptorRejection.get(name);
+    this._lastInterceptorRejection.set(name, key);
+    if (prev === key) {
+      return JSON.stringify({
+        error: `${name}: same call was just rejected by ${reason} \u2014 do not retry identical args. Switch to read-only exploration, submit or revise the plan, or choose a different tool call.`,
+        rejectedReason: reason,
+        consecutiveInterceptorRejection: true
+      });
+    }
+    return result;
+  }
 };
+function rejectedReason(result) {
+  try {
+    const parsed = JSON.parse(result);
+    if (!parsed || typeof parsed !== "object") return null;
+    const reason = parsed.rejectedReason;
+    return typeof reason === "string" && reason ? reason : null;
+  } catch {
+    return null;
+  }
+}
 function isReadOnlyCall(tool, args) {
   if (tool.readOnlyCheck) {
     try {
@@ -9045,7 +9122,7 @@ async function applyMultiEdit(rootDir, edits) {
         );
       }
       const le = before.includes("\r\n") ? "\r\n" : "\n";
-      state = { buf: before, le, hunks: [], deltaChars: 0, touched: 0 };
+      state = { before, buf: before, le, hunks: [], deltaChars: 0, touched: 0 };
       filesByPath.set(e.abs, state);
     }
     const adaptedSearch = e.search.replace(/\r?\n/g, state.le);
@@ -9053,7 +9130,7 @@ async function applyMultiEdit(rootDir, edits) {
     const firstIdx = state.buf.indexOf(adaptedSearch);
     if (firstIdx < 0) {
       throw new Error(
-        `multi_edit: edit #${i + 1} search text not found in ${rel} \u2014 no edits applied (multi_edit is atomic)`
+        `multi_edit: edit #${i + 1} search text not found in ${rel} \u2014 no edits applied`
       );
     }
     const nextIdx = state.buf.indexOf(adaptedSearch, firstIdx + 1);
@@ -9069,8 +9146,29 @@ ${renderEditDiff(adaptedSearch, adaptedReplace, startLine)}`);
     state.deltaChars += adaptedReplace.length - adaptedSearch.length;
     state.touched++;
   }
-  for (const [abs, state] of filesByPath) {
-    await fs.writeFile(abs, state.buf, "utf8");
+  const attempted = [];
+  try {
+    for (const [abs, state] of filesByPath) {
+      attempted.push({ abs, before: state.before });
+      await fs.writeFile(abs, state.buf, "utf8");
+    }
+  } catch (writeErr) {
+    const rollbackFailures = [];
+    for (const item of [...attempted].reverse()) {
+      try {
+        await fs.writeFile(item.abs, item.before, "utf8");
+      } catch (restoreErr) {
+        rollbackFailures.push(`${displayRel(rootDir, item.abs)}: ${restoreErr.message}`);
+      }
+    }
+    if (rollbackFailures.length > 0) {
+      throw new Error(
+        `multi_edit: write failed after partial application: ${writeErr.message}; rollback failed for ${rollbackFailures.join("; ")}`
+      );
+    }
+    throw new Error(
+      `multi_edit: write failed: ${writeErr.message}; rolled back all files that may have been modified`
+    );
   }
   const fileCount = filesByPath.size;
   const editCount = edits.length;
@@ -9739,7 +9837,7 @@ async function searchContent(ctx, startAbs, args) {
 }
 // src/tools/filesystem.ts
-var DEFAULT_OUTLINE_THRESHOLD_BYTES = 512 * 1024;
+var DEFAULT_OUTLINE_THRESHOLD_BYTES = 64 * 1024;
 var DEFAULT_MAX_LIST_BYTES = 256 * 1024;
 var HARD_MAX_FILE_BYTES = 32 * 1024 * 1024;
 var OUTLINE_HEAD_LINES = 80;
@@ -9881,11 +9979,7 @@ ${body}`;
   registry.register({
     name: "read_file",
     parallelSafe: true,
-    description: `Read a file under the sandbox root. Default behaviour returns FULL CONTENT for files at or under ${Math.round(DEFAULT_OUTLINE_THRESHOLD_BYTES / 1024)} KiB \u2014 trust the prompt cache, don't pre-truncate. Optional scoping:
-  - head: N  \u2192 first N lines (cheap probe of imports / config head)
-  - tail: N  \u2192 last N lines (recent-tail of a log)
-  - range: "A-B"  \u2192 inclusive 1-indexed range (e.g. "120-180" around an edit site)
-Files OVER the threshold auto-switch to outline mode: file metadata + first ${OUTLINE_HEAD_LINES} lines + a top-level symbol outline (TS/JS exports, Python def/class, Go func/type, Rust fn/struct/impl/trait, Markdown headings, Protobuf message/service/rpc, plain-text chapter markers) + concrete next-step commands. No middle bytes \u2014 drill in with range / search_content. Files over ${Math.round(HARD_MAX_FILE_BYTES / (1024 * 1024))} MiB are refused entirely (use grep / range). Binary files are refused \u2014 use get_file_info if you only need stat.`,
+    description: `Read a file under the sandbox root. Default returns FULL CONTENT for files \u2264 ${Math.round(DEFAULT_OUTLINE_THRESHOLD_BYTES / 1024)} KiB. Optional scoping: head/tail (N lines), range "A-B" (1-indexed inclusive). Larger files auto-switch to outline mode (metadata + head + symbol outline for TS/JS/Python/Go/Rust/Markdown/Protobuf/text) \u2014 drill in with range or search_content. Files over ${Math.round(HARD_MAX_FILE_BYTES / (1024 * 1024))} MiB and binaries are refused \u2014 use get_file_info for stat.`,
     readOnly: true,
     stormExempt: true,
     parameters: {
@@ -10003,11 +10097,7 @@ ${slice.join("\n")}`);
   registry.register({
     name: "directory_tree",
     parallelSafe: true,
-    description: `Recursively list entries in a directory. Shows indented tree structure with directories marked '/'. Budget-aware by default:
-  - maxDepth defaults to 2 (root + one level). A depth-4 tree on a real repo blew ~5K tokens in one call. If you truly need deeper, pass maxDepth:N explicitly.
-  - Skips ${[...SKIP_DIR_NAMES].sort().join(", ")} unless include_deps:true. Traversing into node_modules / .git / dist is almost always token-waste.
-  - Large subtrees (>50 children) auto-collapse to "[N files, M dirs hidden \u2014 list_directory <path> to inspect]" so one huge folder can't dominate the output.
-Prefer \`list_directory\` for a single-level view, \`search_files\` to find specific paths, and \`search_content\` to find code.`,
+    description: `Recursively list entries with indented tree structure (dirs marked '/'). Budget-aware: maxDepth defaults to 2, large subtrees (>50 children) auto-collapse to "[N hidden \u2014 list_directory to inspect]", and ${[...SKIP_DIR_NAMES].sort().join(" / ")} are skipped unless include_deps:true. For single-level use list_directory; for path lookups use search_files; for code lookups use search_content.`,
     readOnly: true,
     parameters: {
       type: "object",
@@ -10108,38 +10198,38 @@ Prefer \`list_directory\` for a single-level view, \`search_files\` to find spec
   registry.register({
     name: "search_content",
     parallelSafe: true,
-    description: "Recursively grep file CONTENTS for a substring or regex. This is the right tool for 'find all places that call X', 'where is Y referenced', 'what files contain Z'. Different from search_files (which matches FILE NAMES). Returns one match per line in 'path:line: text' format. Per-file hits are capped at 30 (a footer reports any extras); when the byte budget is mostly spent the remaining files switch to a 'rel: N matches' histogram so distribution stays visible instead of one popular file drowning the rest. Pass `summary_only:true` to skip line content entirely and get just the histogram. Skips dependency / VCS / build directories (node_modules, .git, dist, build, .next, target, .venv) and binary files by default.",
+    description: "Recursively grep file CONTENTS for a substring or regex \u2014 'where is X called', 'what files contain Y'. Returns one match per line as `path:line: text`. Per-file hit cap 30; when the byte budget is mostly spent, remaining files switch to a `rel: N matches` histogram. Pass `summary_only:true` for just the histogram. Skips dependency / VCS / build dirs and binary files. For file NAMES use search_files.",
     readOnly: true,
     parameters: {
       type: "object",
       properties: {
         pattern: {
           type: "string",
-          description: "Substring (or regex) to search file contents for."
+          description: "Substring or regex."
         },
         path: {
           type: "string",
-          description: "Directory to start the search at (default: sandbox root)."
+          description: "Search root (default: sandbox root)."
         },
         glob: {
           type: "string",
-          description: "Optional filename filter. Real glob when the value contains `*`, `?`, `{`, or `[` \u2014 e.g. '*.ts', '**/*.tsx', 'src/**/*.{ts,tsx}'. Plain substring otherwise \u2014 e.g. '.ts' (suffix), 'test' (anywhere in the name). Patterns containing `/` match against the path relative to the search root; otherwise just the basename."
+          description: "Filename filter. Glob when it contains `*`/`?`/`{`/`[`; otherwise substring. Patterns with `/` match the path relative to the search root."
         },
         case_sensitive: {
           type: "boolean",
-          description: "When true, match case exactly. Default false (case-insensitive)."
+          description: "Default false."
         },
         include_deps: {
           type: "boolean",
-          description: "When true, also search inside node_modules / .git / dist / build / etc. Off by default \u2014 most exploration questions are about the user's own code."
+          description: "Also search node_modules / .git / dist / build / etc. Default off."
         },
         context: {
           type: "integer",
-          description: "Lines of context to show around each match (both before and after). Default 0 (just the matching line). Capped at 20. Output uses ripgrep style: `:` after the line number on the matching line, `-` on context lines, `--` separating non-adjacent windows."
+          description: "Lines of context around each match (both sides). Default 0, capped 20. Ripgrep-style output."
         },
         summary_only: {
           type: "boolean",
-          description: "When true, skip line content and return one 'rel: N matches' line per matching file. Use for 'where does this exist at all' questions before drilling in with a targeted read_file."
+          description: "Skip line content, return `rel: N matches` per file. Use for 'where does this exist at all' before drilling in."
         }
       },
       required: ["pattern"]
@@ -10252,7 +10342,7 @@ Prefer \`list_directory\` for a single-level view, \`search_files\` to find spec
   });
   registry.register({
     name: "multi_edit",
-    description: "Apply N SEARCH/REPLACE edits across ONE OR MORE files in a single atomic call. Edits run sequentially in array order; for edits that touch the same file, a later edit can match text inserted by an earlier one. If ANY edit fails (search not found, ambiguous match, empty search, file unreadable), NO files are written \u2014 atomic at the validation layer. Same per-edit rules as edit_file: `search` is exact text (whitespace sensitive, no regex) and must be unique in its target file at the moment that edit applies. Use this for renames spanning multiple files, cross-file refactors, or any batch where you'd otherwise loop edit_file.",
+    description: "Apply N SEARCH/REPLACE edits across ONE OR MORE files in one call. Edits validate across the full batch before writing. Validation failures leave all files untouched; disk write failures trigger best-effort rollback of files that may have been modified. Per-file edits run in array order, so a later edit can match text inserted by an earlier one. Same per-edit rules as edit_file: `search` is exact text (whitespace sensitive, no regex) and must be unique in its target file at the moment that edit applies. Use this for renames spanning multiple files, cross-file refactors, or any batch where you'd otherwise loop edit_file.",
     parameters: {
       type: "object",
       properties: {
@@ -10409,7 +10499,7 @@ function registerMemoryTools(registry, opts = {}) {
   }
   registry.register({
     name: "remember",
-    description: "Save a memory for future sessions. Use when the user states a preference, corrects your approach, shares a non-obvious fact about this project, or explicitly asks you to remember something. Don't remember transient task state \u2014 only things worth recalling next session. The memory is written now but won't re-load into the system prompt until the next `/new` or launch.",
+    description: "Save a memory for future sessions \u2014 preferences, corrections, non-obvious project facts. Not for transient task state. Loads into the system prompt on next `/new` or launch.",
     parameters: {
       type: "object",
       properties: {
@@ -10420,29 +10510,29 @@ function registerMemoryTools(registry, opts = {}) {
         scope: {
           type: "string",
           enum: ["global", "project"],
-          description: "'global' = applies across every project (preferences, tooling); 'project' = scoped to the current sandbox (decisions, local facts). Only available in `reasonix code`."
+          description: "global = across all projects; project = current sandbox only (needs `reasonix code`)."
         },
         name: {
           type: "string",
-          description: "filename-safe identifier, 3-40 chars, alnum + _ - . (no path separators, no leading dot)."
+          description: "Filename-safe id, 3-40 chars, alnum + _ - . (no separators, no leading dot)."
         },
         description: {
           type: "string",
-          description: "One-line summary shown in MEMORY.md (under ~150 chars)."
+          description: "\u2264150 char one-liner shown in MEMORY.md."
         },
         content: {
           type: "string",
-          description: "Full memory body in markdown. For feedback/project types, structure as: rule/fact, then **Why:** line, then **How to apply:** line."
+          description: "Markdown body. For feedback/project, structure as rule + **Why:** + **How to apply:**."
         },
         priority: {
           type: "string",
           enum: ["low", "medium", "high"],
-          description: "Optional per-memory priority. `high` injects the entry into a `# HIGH PRIORITY constraints` block at the top of the system prompt \u2014 use sparingly, only for hard rules the model must never violate."
+          description: "`high` injects entry into HIGH PRIORITY block \u2014 use sparingly."
         },
         expires: {
           type: "string",
           enum: ["project_end"],
-          description: "Optional lifecycle hint. `project_end` causes `/memory clear project` to also remove this entry even when it's stored at global scope."
+          description: "`project_end` lets /memory clear project remove this even at global scope."
         }
       },
       required: ["type", "scope", "name", "description", "content"]
@@ -10581,26 +10671,26 @@ function sanitizeOptions(raw) {
 function registerChoiceTool(registry, opts = {}) {
   registry.register({
     name: "ask_choice",
-    description: "Present 2\u20136 alternatives to the user. The principle: if the user is supposed to pick, the tool picks \u2014 you don't enumerate the choices as prose. Prose menus have no picker in this TUI, so the user gets a wall of text to scroll through and a letter to type, strictly worse than the magenta picker this tool renders. Call it whenever (a) the user has asked for options, (b) you've analyzed multiple approaches and the final call is theirs, or (c) it's a preference fork you can't resolve without them. Skip it when one option is clearly best (just do it, or submit_plan) or a free-form text answer fits (ask in prose). Keep option ids short and stable (A/B/C). Each option: title + optional summary. allowCustom=true when their real answer might not fit. Max 6 options \u2014 narrow first if more. A one-sentence lead-in before the call is fine; don't repeat the options in it.",
+    description: "Render an arrow-key picker with 2\u20136 alternatives. Use when the user is supposed to pick \u2014 never enumerate choices as prose. Skip when one option is clearly best (just do it) or a free-form text answer fits. Max 6 options; set `allowCustom:true` when their real answer might not fit.",
     readOnly: true,
     parameters: {
       type: "object",
       properties: {
         question: {
           type: "string",
-          description: "The question to put in front of the user. One sentence. Don't repeat the options in the question text \u2014 the picker renders them separately."
+          description: "One-sentence question. Don't repeat the options here \u2014 the picker renders them."
         },
         options: {
           type: "array",
-          description: "2\u20134 alternatives. Each needs a stable id and a short title; summary is optional.",
+          description: "2\u20136 alternatives. Each: stable id + short title; summary optional.",
           items: {
             type: "object",
             properties: {
-              id: { type: "string", description: "Short stable id (A, B, C, or option-1)." },
-              title: { type: "string", description: "One-line title shown as the option label." },
+              id: { type: "string", description: "Stable id (A, B, C or option-1)." },
+              title: { type: "string", description: "One-line label." },
               summary: {
                 type: "string",
-                description: "Optional. A second dimmed line with more detail. Keep under ~80 chars."
+                description: "Optional dimmed second line, \u226480 chars."
               }
             },
             required: ["id", "title"]
@@ -10608,7 +10698,7 @@ function registerChoiceTool(registry, opts = {}) {
         },
         allowCustom: {
           type: "boolean",
-          description: "If true, the picker shows a 'Let me type my own answer' escape hatch. Default false. Turn on when the user's real answer might not fit any of your pre-defined options."
+          description: "Shows a 'type my own answer' escape hatch. Default false."
         }
       },
       required: ["question", "options"]
@@ -10694,19 +10784,33 @@ var PlanRevisionProposedError = class extends Error {
 };
 // src/tools/plan-core.ts
-var SUBMIT_PLAN_DESCRIPTION = "Submit ONE concrete plan you've already decided on. Use this for tasks that warrant a review gate \u2014 multi-file refactors, architecture changes, anything that would be expensive or confusing to undo. Skip it for small fixes (one-line typo, obvious bug with a clear fix) \u2014 just make the change. The user will either approve (you then implement it), ask for refinement, or cancel. If the user has already enabled /plan mode, writes are blocked at dispatch and you MUST use this. CRITICAL: do NOT use submit_plan to present alternative routes (A/B/C, option 1/2/3) for the user to pick from \u2014 the picker only exposes approve/refine/cancel, so a menu plan strands the user with no way to choose. For branching decisions, call `ask_choice` instead; only call submit_plan once the user has picked a direction and you have a single actionable plan. Write the plan as markdown with a one-line summary, a bulleted list of files to touch and what will change, and any risks or open questions. STRONGLY PREFERRED: pass `steps` \u2014 an array of {id, title, action, risk?} \u2014 so the UI renders a structured step list above the approval picker and tracks per-step progress. Use risk='high' for steps that touch prod data / break public APIs / are hard to undo; 'med' for non-trivial but reversible (multi-file edits, schema tweaks); 'low' for safe local work. After each step, call `mark_step_complete` so the user sees progress ticks.";
-var MARK_STEP_COMPLETE_DESCRIPTION = "Mark one step of the approved plan as done. MANDATORY: call this exactly once after finishing each step, before starting the next one \u2014 skipping it leaves the user staring at `0/N done` on the resume banner even when the work is finished, and they have no way to know which steps actually ran. The TUI updates the plan card's progress in place; the count is persisted to disk so it survives session resume. After the FINAL step, write a brief reply summarizing what was done and end the turn. Pass the `stepId` from the plan's steps array, a short `result` (what you did), and optional `notes` for anything surprising (errors, scope changes, follow-ups). This tool doesn't change any files. Don't call it if the plan didn't include structured steps, and don't invent ids that weren't in the original plan. If you only realized at the end that you skipped marking steps, mark them then \u2014 late is still better than never.";
-var REVISE_PLAN_DESCRIPTION = "Surgically replace the REMAINING steps of an in-flight plan. Call this when the user has given feedback at a checkpoint that warrants a structured plan change \u2014 skip a step, swap two steps, add a new step, change risk, etc. Pass: `reason` (one sentence why), `remainingSteps` (the new tail of the plan, replacing whatever steps haven't been done yet), and optional `summary` (updated one-line plan summary). Done steps are NEVER touched \u2014 keep them out of `remainingSteps`. The TUI shows a diff (removed in red, kept in gray, added in green) and the user accepts or rejects. Don't call this for trivial mid-step adjustments \u2014 just keep executing. Don't call submit_plan for revisions either \u2014 that resets the whole plan including completed steps. Use submit_plan only when the entire approach has changed; use revise_plan when the tail needs editing.";
+var SUBMIT_PLAN_DESCRIPTION = "Submit ONE concrete plan for review. The user approves / refines / cancels \u2014 write a markdown plan body and (strongly preferred) a structured `steps` array. Use for multi-file refactors, architecture changes, anything expensive to undo. Skip for small fixes. Do NOT use for A/B/C menus \u2014 the picker has no branch selector, so a menu plan strands the user; call `ask_choice` for branching decisions. See the system prompt for fuller guidance.";
+var MARK_STEP_COMPLETE_DESCRIPTION = "Mark one approved-plan step as done. Call exactly once after finishing each step, before starting the next. After the FINAL step, write a brief reply summarizing what was done and end the turn. Skip if the plan didn't include structured steps.";
+var REVISE_PLAN_DESCRIPTION = "Replace the REMAINING steps of an in-flight plan when checkpoint feedback warrants a structural change. Pass `reason`, the new `remainingSteps` tail (done steps are untouched \u2014 keep them out), and optional updated `summary`. Don't call submit_plan for revisions \u2014 it resets the whole plan.";
 var STEP_ITEM_SCHEMA = {
   type: "object",
   properties: {
     id: { type: "string", description: "Stable id, e.g. step-1." },
     title: { type: "string", description: "Short imperative title." },
-    action: { type: "string", description: "One-sentence description of the concrete action." },
+    action: { type: "string", description: "One-sentence concrete action." },
     risk: {
       type: "string",
       enum: ["low", "med", "high"],
-      description: "Self-assessed risk. 'high' = hard-to-undo / touches prod / breaks API; 'med' = non-trivial but reversible; 'low' = safe local work. The UI shows a colored dot per step so the user knows where to focus review. Omit if you're unsure."
+      description: "high = hard-to-undo / prod / API break; med = reversible multi-file; low = safe local. Omit if unsure."
+    },
+    targets: {
+      type: "array",
+      description: "Optional. Files/dirs/modules this step touches.",
+      items: { type: "string" }
+    },
+    acceptance: {
+      type: "string",
+      description: "Optional. One-sentence completion criterion."
+    },
+    verification: {
+      type: "array",
+      description: "Optional. Verification commands/checks for this step.",
+      items: { type: "string" }
     }
   },
   required: ["id", "title", "action"]
@@ -10728,10 +10832,42 @@ function sanitizeSteps(raw) {
     const step = { id, title, action };
     const risk = sanitizeRisk(e.risk);
     if (risk) step.risk = risk;
+    const targets = sanitizeStringList(e.targets);
+    if (targets) step.targets = targets;
+    const acceptance = typeof e.acceptance === "string" ? e.acceptance.trim() : "";
+    if (acceptance) step.acceptance = acceptance;
+    const verification = sanitizeStringList(e.verification);
+    if (verification) step.verification = verification;
     steps.push(step);
   }
   return steps.length > 0 ? steps : void 0;
 }
+function sanitizeStringList(raw) {
+  if (!Array.isArray(raw)) return void 0;
+  const out = raw.map((entry) => typeof entry === "string" ? entry.trim() : "").filter((entry) => entry.length > 0);
+  return out.length > 0 ? out : void 0;
+}
+function sanitizeEvidence(raw) {
+  if (!Array.isArray(raw)) return void 0;
+  const out = [];
+  for (const item of raw) {
+    if (!item || typeof item !== "object") continue;
+    const e = item;
+    const kind = e.kind;
+    if (kind !== "verification" && kind !== "diff" && kind !== "checkpoint" && kind !== "manual") {
+      continue;
+    }
+    const summary = typeof e.summary === "string" ? e.summary.trim() : "";
+    if (!summary) continue;
+    const evidence = { kind, summary };
+    const command = typeof e.command === "string" ? e.command.trim() : "";
+    if (command) evidence.command = command;
+    const paths = sanitizeStringList(e.paths);
+    if (paths) evidence.paths = paths;
+    out.push(evidence);
+  }
+  return out.length > 0 ? out : void 0;
+}
 function registerSubmitPlan(registry, opts) {
   registry.register({
     name: "submit_plan",
@@ -10742,16 +10878,16 @@ function registerSubmitPlan(registry, opts) {
       properties: {
         plan: {
           type: "string",
-          description: "Markdown-formatted plan. Lead with a one-sentence summary. Then a file-by-file breakdown of what you'll change and why. Flag any risks or open questions at the end so the user can weigh in before you start."
+          description: "Markdown plan: one-line summary, file-by-file breakdown, risks/open questions."
         },
         steps: {
           type: "array",
-          description: "Structured step list (strongly recommended). When provided, the UI renders a compact step list above the approval picker AND tracks per-step progress via `mark_step_complete`. Use stable ids (step-1, step-2, ...). Skip only for tiny one-step plans where the markdown body is enough.",
+          description: "Structured step list \u2014 strongly recommended for >1 step. Stable ids (step-1, step-2, ...).",
           items: STEP_ITEM_SCHEMA
         },
         summary: {
           type: "string",
-          description: "Optional. One-sentence human-friendly title for the plan, ~80 chars max. Surfaces in the PlanConfirm picker header and in /plans listings ('\u25B8 refactor auth into signed tokens \xB7 2/5 done'). Skip for trivial plans where the first line of the markdown body is already short and clear."
+          description: "Optional ~80-char plan title for the picker header and /plans listings."
         }
       },
       required: ["plan"]
@@ -10789,19 +10925,33 @@ function registerMarkStepComplete(registry, opts) {
       properties: {
         stepId: {
           type: "string",
-          description: "The id of the step being marked complete. Must match one from submit_plan's steps array."
+          description: "Step id from submit_plan's steps array."
         },
         title: {
           type: "string",
-          description: "Optional. The step's title, echoed back for the UI. If omitted, the UI falls back to the id."
+          description: "Optional. Echoed for the UI; falls back to id."
         },
         result: {
           type: "string",
-          description: "One-sentence summary of what was done for this step."
+          description: "One-sentence summary of what was done."
         },
         notes: {
           type: "string",
-          description: "Optional. Anything surprising \u2014 blockers hit, assumptions revised, follow-ups for later steps."
+          description: "Optional. Surprises \u2014 blockers, revised assumptions, follow-ups."
+        },
+        evidence: {
+          type: "array",
+          description: "Optional. Verification summary / diff / checkpoint ref / manual note.",
+          items: {
+            type: "object",
+            properties: {
+              kind: { type: "string", enum: ["verification", "diff", "checkpoint", "manual"] },
+              summary: { type: "string" },
+              command: { type: "string" },
+              paths: { type: "array", items: { type: "string" } }
+            },
+            required: ["kind", "summary"]
+          }
         }
       },
       required: ["stepId", "result"]
@@ -10819,9 +10969,15 @@ function registerMarkStepComplete(registry, opts) {
       }
       const title = typeof args?.title === "string" ? args.title.trim() || void 0 : void 0;
       const notes = typeof args?.notes === "string" ? args.notes.trim() || void 0 : void 0;
+      const evidence = sanitizeEvidence(args?.evidence);
+      const evidenceReason = opts.requireStepEvidence?.({ stepId, title });
+      if (evidenceReason && (!evidence || evidence.length === 0)) {
+        throw new Error(`mark_step_complete: evidence required \u2014 ${evidenceReason}`);
+      }
       const update = { kind: "step_completed", stepId, result };
       if (title) update.title = title;
       if (notes) update.notes = notes;
+      if (evidence) update.evidence = evidence;
       opts.onStepCompleted?.(update);
       const verdict = await (ctx?.confirmationGate ?? pauseGate).ask({
         kind: "plan_checkpoint",
@@ -10846,16 +11002,16 @@ function registerRevisePlan(registry, opts) {
       properties: {
         reason: {
           type: "string",
-          description: "One sentence explaining why you're revising \u2014 what the user asked for, what changed your assessment."
+          description: "One sentence \u2014 why you're revising / what the user asked for."
         },
         remainingSteps: {
           type: "array",
-          description: "The new tail of the plan \u2014 what should run from here on. Each entry: {id, title, action, risk?}. Use stable ids; reuse old ids when a step is just being adjusted, generate new ones for genuinely new steps.",
+          description: "New tail of the plan. Reuse old ids when adjusting; new ids for new steps.",
           items: STEP_ITEM_SCHEMA
         },
         summary: {
           type: "string",
-          description: "Optional. Updated one-line plan summary if the overall framing has shifted."
+          description: "Optional. Updated one-line summary when framing has shifted."
         }
       },
       required: ["reason", "remainingSteps"]
@@ -10893,7 +11049,7 @@ function registerPlanTool(registry, opts = {}) {
 }
 // src/tools/todo.ts
-var DESCRIPTION = 'In-session task tracker for multi-step work. NOT a plan \u2014 no approval gate, no checkpoint pauses, doesn\'t touch any files. The tool replaces the entire todo list every call (set semantics, NOT append). Pass the FULL list every time.\n\nWhen to use:\n\u2022 The task has 3+ distinct steps and you want to keep them straight as you work.\n\u2022 The user gave you a multi-part request ("do A, then B, then C").\n\u2022 You\'re partway through a long task and want to record where you are so a future you doesn\'t lose the thread.\n\nWhen NOT to use:\n\u2022 One-shot edits, single-question answers, single-tool tasks.\n\u2022 User-facing approval gates \u2192 that\'s `submit_plan`.\n\u2022 Branching choices \u2192 that\'s `ask_choice`.\n\nRules:\n\u2022 Exactly ONE todo may have status:"in_progress" at a time (or zero \u2014 between steps).\n\u2022 Mark a todo "completed" the moment it\'s actually done \u2014 don\'t batch.\n\u2022 Each todo: `content` (imperative, e.g. "Add tests"), `activeForm` (gerund shown while running, e.g. "Adding tests"), `status`.\n\u2022 Empty `todos:[]` is allowed \u2014 it clears the list when work is fully done.';
+var DESCRIPTION = "In-session task tracker for 3+ step work. NOT a plan \u2014 no approval gate, no checkpoint, no files touched. Each call REPLACES the entire list (set semantics) \u2014 pass the FULL list. Exactly one item may be in_progress at a time; flip to completed the moment that step's done. Pass `[]` to clear. For approval gates use submit_plan; for branching choices use ask_choice.";
 function validateTodos(raw) {
   if (!Array.isArray(raw)) {
     throw new Error("todo_write: `todos` must be an array");
@@ -12284,8 +12440,13 @@ var OutputBuffer = class {
 };
 // src/tools/shell/parse.ts
-import { homedir as homedir6 } from "os";
+import { homedir as homedir7 } from "os";
 import * as pathMod8 from "path";
+// packages/core-utils/src/tildeify.ts
+import { homedir as homedir6 } from "os";
+// src/tools/shell/parse.ts
 var BUILTIN_ALLOWLIST = [
   // Repo inspection
   "git status",
@@ -12485,12 +12646,12 @@ function resolveSensitivePath(token, projectRoot) {
     return null;
   let expanded = token;
   if (expanded.startsWith("~")) {
-    expanded = pathMod8.join(homedir6(), expanded.slice(1));
+    expanded = pathMod8.join(homedir7(), expanded.slice(1));
   }
   return pathMod8.resolve(projectRoot, expanded);
 }
 function expandPrefix(prefix) {
-  if (prefix.startsWith("~")) return pathMod8.join(homedir6(), prefix.slice(1));
+  if (prefix.startsWith("~")) return pathMod8.join(homedir7(), prefix.slice(1));
   return pathMod8.resolve(prefix);
 }
 function pathStartsWithPrefix(normalized, prefix) {
@@ -12863,7 +13024,7 @@ function registerShellTools(registry, opts) {
   const isAllowAll = typeof opts.allowAll === "function" ? opts.allowAll : () => opts.allowAll === true;
   registry.register({
     name: "run_command",
-    description: "Run a shell command in the project root; returns combined stdout+stderr. Allowlisted read-only / test / lint / typecheck commands run immediately; anything that could mutate state, install deps, or touch the network is gated by user confirmation. Prefer this over asking the user to run a command manually \u2014 after edits, run the project's tests to verify.\n\nConstraints (no real shell \u2014 argv is parsed natively for cross-platform parity):\n\u2022 Supported: chain ops `|` / `||` / `&&` / `;` (each segment allowlist-checked individually), file redirects `>` / `>>` / `<` / `2>` / `2>>` / `2>&1` / `&>` (target paths resolve relative to project root, max one redirect per fd per segment).\n\u2022 NOT supported: background `&`, heredoc `<<`, command substitution `$(\u2026)`, subshells `(\u2026)`, process substitution `<(\u2026)`, `$VAR` env expansion, glob expansion. To pass an operator char as literal arg, quote it (`grep \"a|b\" file`).\n\u2022 `cd` does NOT persist \u2014 between calls OR within a chain like `cd dir && cmd`. Use the binary's own cwd flag: `npm --prefix <dir>`, `git -C <dir>`, `cargo -C <dir>`, `pytest <dir>/tests`.\n\u2022 Filter at source \u2014 unbounded output (`netstat -ano`, `find /`) wastes tokens. Use `grep -c`, `wc -l`, narrower paths, etc.",
+    description: 'Run a shell command in the project root; returns combined stdout+stderr. Allowlisted read-only / test / lint / typecheck commands run immediately; mutating / network / install commands gate on user confirmation.\n\nNo real shell \u2014 argv parsed natively for cross-platform parity:\n\u2022 Supported: chains `|`/`||`/`&&`/`;` (each segment allowlist-checked) and file redirects `>`/`>>`/`<`/`2>`/`2>>`/`2>&1`/`&>`.\n\u2022 Rejected: background `&`, heredoc `<<`, `$(\u2026)`, subshells, `$VAR` expansion, glob expansion. Quote operator chars as literals (`grep "a|b" file`).\n\u2022 `cd` does NOT persist \u2014 between calls OR within a chain. Use `npm --prefix <dir>`, `git -C <dir>`, `cargo -C <dir>` instead.\n\u2022 Filter at source \u2014 `grep -c` / `wc -l` / narrower paths over unbounded dumps.',
     // Plan-mode gate: allow allowlisted commands through (git status,
     // cargo check, ls, grep …) so the model can actually investigate
     // during planning. Anything that would otherwise trigger a
@@ -12918,7 +13079,7 @@ function registerShellTools(registry, opts) {
   });
   registry.register({
     name: "run_background",
-    description: "Spawn a long-running process and detach. Waits up to `waitSec` for startup or a readiness signal ('Local:', 'listening on', 'compiled successfully'), then returns the job id + startup preview. Tail logs with `job_output`, block on completion with `wait_for_job`, kill with `stop_job`, list with `list_jobs`.\n\nSingle process only \u2014 no chains / redirects. For subdirectories use the `cwd` parameter (workspace-relative or absolute, must stay inside the workspace root); do NOT write `cd X && cmd`, that gets rejected.\n\nUSE THIS \u2014 not run_command \u2014 for:\n- Dev servers / watchers: npm/yarn/pnpm dev, uvicorn / flask run, cargo watch, tsc --watch, webpack serve, anything with dev/serve/watch in the name.\n- One-shot long jobs: curl / wget large downloads, `huggingface-cli download`, multi-GB `pip install` / `npm install`, big `cargo build` / `docker build`. Start with `run_background`, then call `wait_for_job` once (default `waitFor: 'exit'`, timeoutMs up to 300_000) \u2014 the harness blocks server-side so a 5-minute download costs ONE tool call, not 30 polls.",
+    description: "Spawn a long-running process and detach. Waits up to `waitSec` for startup or a readiness signal ('Local:', 'listening on', 'compiled successfully'), then returns job id + startup preview. Companion tools: `job_output`, `wait_for_job`, `stop_job`, `list_jobs`. Single process only \u2014 no chains/redirects. Use `cwd` (not `cd X && cmd`) for subdirs.\n\nUSE THIS \u2014 not run_command \u2014 for: dev servers / watchers (`npm dev`, `uvicorn`, `tsc --watch`, anything with dev/serve/watch in the name) AND one-shot long jobs (large `curl`, `pip install`, `cargo build`, `docker build`). Pair with `wait_for_job` for server-side blocking \u2014 one tool call regardless of duration.",
     parameters: {
       type: "object",
       properties: {
@@ -13564,8 +13725,8 @@ function registerWebTools(registry, opts = {}) {
       required: ["query"]
     },
     fn: async (args, ctx) => {
-      const engine = opts.webSearchEngine ?? webSearchEngine();
-      const endpoint = opts.webSearchEndpoint ?? webSearchEndpoint();
+      const engine = webSearchEngine();
+      const endpoint = webSearchEndpoint();
       const results = await webSearch(args.query, {
         topK: args.topK ?? defaultTopK,
         signal: ctx?.signal,
@@ -14069,7 +14230,7 @@ function truncate(s, n) {
 // src/version.ts
 import { existsSync as existsSync10, mkdirSync as mkdirSync5, readFileSync as readFileSync13, writeFileSync as writeFileSync5 } from "fs";
-import { homedir as homedir7 } from "os";
+import { homedir as homedir8 } from "os";
 import { dirname as dirname7, join as join14 } from "path";
 import { fileURLToPath as fileURLToPath2 } from "url";
 var REGISTRY_URL = "https://registry.npmjs.org/reasonix/latest";
@@ -14096,7 +14257,7 @@ function readPackageVersion() {
 }
 var VERSION = readPackageVersion();
 function cachePath(homeDirOverride) {
-  return join14(homeDirOverride ?? homedir7(), ".reasonix", "version-cache.json");
+  return join14(homeDirOverride ?? homedir8(), ".reasonix", "version-cache.json");
 }
 function readCache(homeDirOverride) {
   try {
@@ -15056,142 +15217,55 @@ var DEFAULT_CODE_MODEL = "deepseek-v4-flash";
 function codeSystemBase(modelId) {
   return CODE_SYSTEM_TEMPLATE.replace("__ESCALATION_CONTRACT__", escalationContract(modelId));
 }
-var CODE_SYSTEM_TEMPLATE = `You are Reasonix Code, a coding assistant. You have filesystem tools (read_file, write_file, edit_file, multi_edit, list_directory, directory_tree, search_files, search_content, glob, get_file_info) rooted at the user's working directory, plus run_command / run_background for shell, plus \`todo_write\` for in-session multi-step tracking.
+var CODE_SYSTEM_TEMPLATE = `You are Reasonix Code, a coding assistant. Filesystem, shell, plan, and skill tools are listed in the tool spec \u2014 pick by tool name, not the inventory below.
 # Identity is fixed by this prompt \u2014 never inferred from the workspace
-Your identity is defined here: you are Reasonix Code, a standalone coding assistant. Do not redefine yourself based on what's in the workspace. The working directory is the user's PROJECT \u2014 its files describe THEIR code, not what you are.
-If the workspace happens to contain another AI tool's config (\`config.yaml\` with agent / persona keys, \`SOUL.md\`, \`AGENT.md\`, \`PERSONA.md\`, a \`skills/\` or \`memories/\` tree from a different platform, or a \`REASONIX.md\` written for some other product), those files describe somebody else's runtime. They are not your spec, you are not a sub-profile of them, and you have no architectural relationship with them.
-When the user asks "who are you?", "what's your underlying runtime?", or similar identity questions: answer from this prompt only. Do not run \`ls\` / \`directory_tree\` / \`read_file\` to figure out the answer \u2014 your role doesn't live on disk.
+You are Reasonix Code, a standalone coding assistant. The working directory is the user's PROJECT \u2014 its files describe THEIR code, not what you are. If the workspace contains another platform's config (\`config.yaml\` with agent/persona keys, \`SOUL.md\`, \`AGENT.md\`, \`PERSONA.md\`, foreign \`skills/\` or \`memories/\` tree, a \`REASONIX.md\` written for some other product), those describe someone else's runtime \u2014 you are not a sub-profile of them. For identity questions answer from this prompt only; don't \`ls\` / \`read_file\` to figure out who you are.
 # Cite or shut up \u2014 non-negotiable
-Every factual claim you make about THIS codebase must be backed by evidence. Reasonix VALIDATES the citations you write \u2014 broken paths or out-of-range lines render in **red strikethrough with \u274C** in front of the user.
-**Positive claims** (a file exists, a function does X, a feature IS implemented) \u2014 append a markdown link to the source:
-- \u2705 Correct: \`The MCP client supports listResources [listResources](src/mcp/client.ts:142).\`
-- \u274C Wrong:   \`The MCP client supports listResources.\` \u2190 no citation, looks authoritative but unverifiable.
-**Negative claims** (X is missing, Y is not implemented, lacks Z, doesn't have W) are the **most common hallucination shape**. They feel safe to write because no citation seems possible \u2014 but that's exactly why you must NOT write them on instinct.
-If you are about to write "X is missing" or "Y is not implemented" \u2014 **STOP**. Call \`search_content\` for the relevant symbol or term FIRST. Only then:
-- If the search returns matches \u2192 you were wrong; correct yourself and cite the matches.
-- If the search returns nothing \u2192 state the absence with the search query as your evidence: \`No callers of \\\`foo()\\\` found (search_content "foo").\`
-Asserting absence without a search is the #1 way evaluative answers go wrong. Treat the urge to write "missing" as a red flag in your own reasoning.
+Every factual claim about THIS codebase needs evidence \u2014 Reasonix VALIDATES citations and broken paths render in **red strikethrough with \u274C**. **Positive claims** (file/function/feature exists) append a markdown source link: \`The MCP client supports listResources [listResources](src/mcp/client.ts:142).\` **Negative claims** ("X is missing", "Y isn't implemented") are the #1 hallucination shape \u2014 STOP and \`search_content\` the symbol FIRST. If the search returns nothing, state absence WITH the query as evidence: \`No callers of \\\`foo()\\\` found (search_content "foo").\`
 # When auditing or reviewing this codebase
-When you're asked to audit / review / critique Reasonix itself ("what tools are missing?", "review the prompt system", "anything wrong with how X works?"), the failure mode isn't hallucinating absences \u2014 it's building confident, well-structured proposals on factually wrong premises. Six rails:
-- **Auto-preview is for locating, not auditing.** Files past the auto-preview threshold come back as \`head + tail\` with the middle elided. Don't conclude what's in the elided section \u2014 runtime behavior, current architectural state, whether a plan doc is still accurate \u2014 off the preview. Re-call \`read_file\` with \`range:"A-B"\` against the actual section before asserting what it says.
-- **Flag \u2192 consumer trace.** Reading a type field (\`parallelSafe?: boolean\`, \`stormExempt?: boolean\`) is not understanding behavior. Before claiming "tool X runs in mode Y", \`search_content\` for the flag's CONSUMER and read the branch that acts on it. **For inventory claims** ("which tools have flag F?"), grep the flag \u2014 don't enumerate from memory; the field is set per-tool and easily mis-recalled.
-- **No fabricated percentages.** "Saves 40-60% tokens" reads like evidence but is invented unless you computed it. Ground numbers in a cited transcript / token count, or use hedged language ("small but non-zero", "may compound") \u2014 never present an unmeasured number as a measured one.
-- **Schema cost is real.** Every tool's description ships in every request. A new-tool proposal MUST cover (a) which existing-tool composition fails to do this, (b) rough description-token cost, (c) why a prompt or description change can't reach the same end. Default to "tighten prompt / existing tool" before "add tool".
-- **MEMORY.md is part of the design space.** The pinned memory blocks above are loaded user feedback \u2014 recommendations contradicting them ("auto-commit checkpoints", "free-credit messaging", anything the user has explicitly ruled out) are wrong by construction. Cross-check before proposing.
-- **User-facing \u2260 model-facing \u2260 library-facing.** Reasonix has four action surfaces: slash commands (user), tools (model), UI (user), and library exports (\`src/index.ts\`). Promoting a user-level feature (\`/checkpoint\`, \`/undo\`, \`/plan\`) to a model tool breaks user-control invariants. Treating a library export as "dead code" because the CLI doesn't register it to the model misreads the design \u2014 embedders consume \`src/index.ts\` directly.
-# When to propose a plan (submit_plan)
-You have a \`submit_plan\` tool that shows the user a markdown plan and lets them Approve / Refine / Cancel before you execute. Use it proactively when the task is large enough to deserve a review gate:
-- Multi-file refactors or renames.
-- Architecture changes (moving modules, splitting / merging files, new abstractions).
-- Anything where "undo" after the fact would be expensive \u2014 migrations, destructive cleanups, API shape changes.
-- When the user's request is ambiguous and multiple reasonable interpretations exist \u2014 propose your reading as a plan and let them confirm.
-Skip submit_plan for small, obvious changes: one-line typo, clear bug with a clear fix, adding a missing import, renaming a local variable. Just do those.
-Plan body: one-sentence summary, then a file-by-file breakdown of what you'll change and why, and any risks or open questions. If some decisions are genuinely up to the user (naming, tradeoffs, out-of-scope possibilities), list them in an "Open questions" section \u2014 the user sees the plan in a picker and has a text input to answer your questions before approving. Don't pretend certainty you don't have; flagged questions are how the user tells you what they care about. After calling submit_plan, STOP \u2014 don't call any more tools, wait for the user's verdict.
-**Do NOT use submit_plan to present A/B/C route menus.** The approve/refine/cancel picker has no branch selector \u2014 a menu plan strands the user. For branching decisions, use \`ask_choice\` (see below); only call submit_plan once the user has picked a direction and you have ONE actionable plan.
-# When to ask the user to pick (ask_choice)
-You have an \`ask_choice\` tool. **If the user is supposed to pick between alternatives, the tool picks \u2014 you don't enumerate the choices as prose.** Prose menus have no picker in this TUI: the user gets a wall of text and has to type a letter back. The tool fires an arrow-key picker that's strictly better.
-Call it when:
-- The user has asked for options / doesn't want a recommendation / wants to decide.
-- You've analyzed multiple approaches and the final call is theirs.
-- It's a preference fork you can't resolve without them (deployment target, team convention, taste).
-Skip it when one option is clearly correct (just do it, or submit_plan) or a free-form text answer fits (ask in prose).
-Each option: short stable id (A/B/C), one-line title, optional summary. \`allowCustom: true\` when their real answer might not fit. Max 6. A ~1-sentence lead-in before the call is fine ("I see three directions \u2014 letting you pick"); don't repeat the options in it. After the call, STOP.
+When asked to audit/review/critique Reasonix itself, the failure mode is building confident proposals on factually wrong premises. Six rails:
-# When to track multi-step intent (todo_write)
+- **Auto-preview is for locating, not auditing.** Auto-preview returns \`head + tail\` with the middle elided \u2014 don't conclude what's in the elided section (runtime behavior, current architectural state, whether a plan doc is still accurate) from it. Re-call \`read_file\` with \`range:"A-B"\` before asserting.
+- **Flag \u2192 consumer trace.** Reading a type field (\`parallelSafe?: boolean\`, \`stormExempt?: boolean\`) is not understanding behavior \u2014 \`search_content\` for the flag's CONSUMER and read the branch that acts on it. **For inventory claims** ("which tools have flag F?"), grep the flag \u2014 don't enumerate from memory; the field is set per-tool and easily mis-recalled.
+- **No fabricated percentages.** "Saves 40-60% tokens" is invented unless you computed it. Ground in a cited transcript or use hedged language; never present unmeasured numbers as measured.
+- **Schema cost is real.** Every tool's description ships in every request \u2014 new-tool proposals must cover (a) which existing-tool composition fails, (b) rough token cost, (c) why a prompt or description change can't reach the same end. Default to "tighten prompt / existing tool".
+- **MEMORY.md is part of the design space.** Pinned memory blocks are loaded user feedback \u2014 recommendations contradicting them are wrong by construction. Cross-check before proposing.
+- **User-facing \u2260 model-facing \u2260 library-facing.** Four surfaces: slash commands (user), tools (model), UI (user), library exports (\`src/index.ts\`). Promoting a user feature to a model tool breaks user-control invariants. Treating a library export as "dead code" because the CLI doesn't register it misreads the design \u2014 embedders consume \`src/index.ts\` directly.
-\`todo_write\` is a lightweight in-session task tracker \u2014 NOT a plan. No approval gate, no checkpoint pauses, doesn't touch files. Use it when the task has 3+ distinct steps and you'd otherwise lose track of where you are. Each call REPLACES the entire list (set semantics). Exactly one item may be \`in_progress\` at a time \u2014 flip it to \`completed\` the moment that step's done, before starting the next.
+# Picking the right tool: submit_plan / ask_choice / todo_write
-Use it for:
-- Multi-part user requests ("do A, then B, then C") \u2014 record the parts so you don't drop one.
-- Long refactors where you've finished step 2 of 5 and want a visible record.
-- Any moment where you'd otherwise enumerate "1. ... 2. ... 3. ..." in prose \u2014 the tool is strictly better, the UI shows progress live.
-Skip it for: one-shot edits, single-question answers, anything that fits in one tool call. Don't \`todo_write\` and \`submit_plan\` for the same work \u2014 \`submit_plan\` is for tasks that need a review gate; \`todo_write\` is for personal bookkeeping after the user has already given you the green light.
-Call shape: \`{ todos: [{ content, activeForm, status }, ...] }\` \u2014 \`content\` is imperative ("Add tests"), \`activeForm\` is gerund ("Adding tests") shown while \`in_progress\`. Pass the FULL list every call, not a delta. Pass \`todos: []\` to clear when work's done.
+- **submit_plan** \u2014 review-gate for multi-file refactors, architecture changes, anything expensive to undo. Markdown body + structured \`steps\`. After calling, STOP and wait. Do NOT use for A/B/C menus \u2014 the picker has approve/refine/cancel only, so a menu strands the user.
+- **ask_choice** \u2014 when the user is supposed to pick between alternatives, the TOOL picks; never enumerate choices as prose. Use when they asked for options, or it's a preference fork only they can resolve. Skip when one option is clearly correct (just do it). After calling, STOP.
+- **todo_write** \u2014 in-session tracker for 3+ step work. NOT a plan (no approval gate, no files touched). One \`in_progress\` at a time; flip to \`completed\` immediately. For approval gates use submit_plan; for branching use ask_choice.
 # Plan mode (/plan)
-The user can ALSO enter "plan mode" via /plan, which is a stronger, explicit constraint:
-- Write tools (edit_file, multi_edit, write_file, create_directory, move_file, copy_file, delete_file, delete_directory) and non-allowlisted run_command calls are BOUNCED at dispatch \u2014 you'll get a tool result like "unavailable in plan mode". Don't retry them.
-- Read tools (read_file, list_directory, search_files, directory_tree, get_file_info) and allowlisted read-only / test shell commands still work \u2014 use them to investigate.
-- You MUST call submit_plan before anything will execute. Approve exits plan mode; Refine stays in; Cancel exits without implementing.
+Stronger constraint than submit_plan: writes + non-allowlisted run_command are bounced at dispatch ("unavailable in plan mode" \u2014 don't retry). Read tools and allowlisted shell commands still work. You MUST call submit_plan before anything will execute.
 # Delegating to subagents via Skills
-The pinned Skills index below lists playbooks you can invoke with \`run_skill\`. Entries tagged \`[\u{1F9EC} subagent]\` spawn an **isolated subagent** \u2014 a fresh child loop that runs the playbook in its own context and returns only the final answer. The subagent's tool calls and reasoning never enter your context, so subagent skills are how you keep the main session lean.
-**When you call \`run_skill\`, the \`name\` is ONLY the identifier before the tag** \u2014 e.g. \`run_skill({ name: "explore", arguments: "..." })\`, NOT \`"[\u{1F9EC} subagent] explore"\` and NOT \`"explore [\u{1F9EC} subagent]"\`. The tag is display sugar; the name argument is just the bare identifier.
-Two built-ins ship by default:
-- **explore** \`[\u{1F9EC} subagent]\` \u2014 read-only investigation across the codebase. Use when the user says things like "find all places that...", "how does X work across the project", "survey the code for Y". Pass \`arguments\` describing the concrete question.
-- **research** \`[\u{1F9EC} subagent]\` \u2014 combines web search + code reading. Use for "is X supported by lib Y", "what's the canonical way to Z", "compare our impl to the spec".
-**Default: don't delegate.** Direct tools (\`search_files\`, \`read_file\`, \`run_command\`, \`web_search\`) are cheaper, faster, and keep evidence in your context where you can refer back to it. A subagent spawn pays a fresh prefix-cache miss and a full child loop \u2014 hundreds of ms of overhead and full input pricing for the child's first turn. For most questions the spawn costs more than it saves.
-Spawn ONLY in these two cases:
-1. **True parallelism** \u2014 you have 2+ independent investigations that can run concurrently in the same tool batch. The wall-time win is real and only achievable via fan-out.
-2. **Context blow-up** \u2014 the work would otherwise need >10 file reads/searches and you only need the conclusion. Keeping the trail out of your context is the actual saving.
+The pinned Skills index below lists every available playbook (built-ins + user-installed). Entries tagged \`[\u{1F9EC} subagent]\` spawn an isolated child loop and return only the final answer \u2014 their tool calls never enter your context. Pass \`name\` as the BARE identifier (e.g. \`"explore"\`), not the \`[\u{1F9EC} subagent]\` tag.
-Anti-patterns \u2014 do NOT spawn for any of these:
-- single grep / single file read \u2192 call the tool directly
-- 1-3 file cross-reference \u2192 read them directly
-- "to keep my context clean for one question" \u2192 not enough saving to justify the spawn
-- anything that needs user interaction (subagents can't submit plans or ask for clarification)
-- anything where you need to track intermediate results yourself (planning, multi-step edits)
-Always pass a clear, self-contained \`arguments\` \u2014 that text is the **only** context the subagent gets.
+**Default: don't delegate.** Direct tools are cheaper and keep evidence in your context. Spawn ONLY for (a) true parallelism \u2014 2+ independent investigations in one batch \u2014 or (b) context blow-up \u2014 >10 file reads where you only need the conclusion. Skip for single grep, 1-3 file cross-references, "to keep context clean for one question", anything needing user interaction, or work where you must track intermediate results yourself. Always pass clear, self-contained \`arguments\` \u2014 the subagent gets no other context.
 # When to edit vs. when to explore
-Only propose edits when the user explicitly asks you to change, fix, add, remove, refactor, or write something. Do NOT propose edits when the user asks you to:
-- analyze, read, explore, describe, or summarize a project
-- explain how something works
-- answer a question about the code
-In those cases, use tools to gather what you need, then reply in prose. No SEARCH/REPLACE blocks, no file changes. If you're unsure what the user wants, ask.
-When you do propose edits, the user will review them and decide whether to \`/apply\` or \`/discard\`. Don't assume they'll accept \u2014 write as if each edit will be audited, because it will.
+Only propose edits when the user explicitly says change / fix / add / remove / refactor / write. For "analyze / read / explain / describe / summarize" requests, gather with tools and reply in prose \u2014 no SEARCH/REPLACE, no file changes. If unclear, ask.
-Reasonix runs an **edit gate**. The user's current mode (\`review\` or \`auto\`) decides what happens to your writes; you DO NOT see which mode is active, and you SHOULD NOT ask. Write the same way in both cases.
-- In \`auto\` mode \`edit_file\` / \`write_file\` calls land on disk immediately with an undo window \u2014 you'll get the normal "edit blocks: 1/1 applied" style response.
-- In \`review\` mode EACH \`edit_file\` / \`write_file\` call pauses tool dispatch while the user decides. You'll get one of these responses:
-  - \`"edit blocks: 1/1 applied"\` \u2014 user approved it. Continue as normal.
-  - \`"User rejected this edit to <path>. Don't retry the same SEARCH/REPLACE\u2026"\` \u2014 user said no to THIS specific edit. Do NOT re-emit the same block, do NOT switch tools to sneak it past the gate (write_file \u2192 edit_file, or text-form SEARCH/REPLACE). Either take a clearly different approach or stop and ask the user what they want instead.
-  - Text-form SEARCH/REPLACE blocks in your assistant reply queue for end-of-turn /apply \u2014 same "don't retry on rejection" rule.
-- If the user presses Esc mid-prompt the whole turn is aborted; you won't get another tool response. Don't keep spamming tool calls after an abort.
+The **edit gate** routes \`edit_file\` / \`write_file\` based on the user's mode (\`review\` or \`auto\`) \u2014 you don't see which is active, write the same way in both. Responses:
+- \`"edit blocks: 1/1 applied"\` \u2014 proceed.
+- \`"User rejected this edit to <path>. Don't retry the same SEARCH/REPLACE\u2026"\` \u2014 do NOT re-emit the same block, do NOT switch tools to sneak it past (write_file \u2192 edit_file, or text-form SEARCH/REPLACE). Take a clearly different approach or ask.
+- Esc mid-prompt aborts the whole turn \u2014 don't keep calling tools after.
 # Editing files
-When you've been asked to change a file, output one or more SEARCH/REPLACE blocks in this exact format:
+Output one or more SEARCH/REPLACE blocks in this exact format:
 path/to/file.ext
 <<<<<<< SEARCH
@@ -15201,83 +15275,48 @@ the new lines
 >>>>>>> REPLACE
 Rules:
-- Always read_file first so your SEARCH matches byte-for-byte. If it doesn't match, the edit is rejected and you'll have to retry with the exact current content.
-- One edit per block. Multiple blocks in one response are fine.
-- To create a new file, leave SEARCH empty:
+- read_file first so your SEARCH matches byte-for-byte.
+- One edit per block; multiple blocks per response are fine.
+- Create a new file with empty SEARCH:
     path/to/new.ts
     <<<<<<< SEARCH
     =======
     (whole file content here)
     >>>>>>> REPLACE
-- Do NOT use write_file to change existing files \u2014 the user reviews your edits as SEARCH/REPLACE. write_file is only for files you explicitly want to overwrite wholesale (rare).
-- Paths are relative to the working directory. Don't use absolute paths.
-- For multi-site changes \u2014 same file or across files \u2014 prefer \`multi_edit\` over N \`edit_file\` calls. Shape: \`{ edits: [{ path, search, replace }, ...] }\`. All edits validate before any file is written; any failure \u2192 ALL files untouched. Per-file edits run in array order, so a later edit can match text inserted by an earlier one.
+- Don't use write_file to change existing files \u2014 the user reviews edits as SEARCH/REPLACE. write_file is for wholesale overwrites only.
+- Paths are relative to the working directory.
+- For multi-site changes use \`multi_edit\` \u2014 validation runs before any write; validation failures leave all files untouched. Write-phase failures attempt best-effort rollback of files that may have been modified.
 # Trust what you already know
-Before exploring the filesystem to answer a factual question, check whether the answer is already in context: the user's current message, earlier turns in this conversation (including prior tool results from \`remember\`), and the pinned memory blocks at the top of this prompt. When the user has stated a fact or you have remembered one, it outranks what the files say \u2014 don't re-derive from code what the user already told you. Explore when you genuinely don't know.
+Before exploring to answer a factual question, check context first: the user's message, prior turns (including \`remember\` results), the pinned memory blocks above. User-stated facts outrank what the files say \u2014 don't re-derive what the user just told you.
 # Exploration
-- Skip dependency, build, and VCS directories unless the user explicitly asks. The pinned .gitignore block (if any, below) is your authoritative denylist.
-- Prefer \`search_files\` over \`list_directory\` when you know roughly what you're looking for \u2014 it saves context and avoids enumerating huge trees. Note: \`search_files\` matches file NAMES; for searching file CONTENTS use \`search_content\`.
-- Available exploration tools: \`read_file\`, \`list_directory\`, \`directory_tree\`, \`search_files\` (filename match), \`glob\` (mtime-sorted glob \u2014 use for "what changed lately", "all *.ts under src/"), \`search_content\` (content grep \u2014 use for "where is X called", "find all references to Y"; pass \`context:N\` for grep -C N around hits), \`get_file_info\`. Don't call \`grep\` or other tools that aren't in this list \u2014 they don't exist as functions.
+Skip dependency, build, and VCS directories unless asked (the pinned .gitignore below is your denylist). \`search_files\` matches FILE NAMES; \`search_content\` matches CONTENTS \u2014 pick accordingly. Use \`glob\` for "what changed lately" / "all *.ts under src/", \`search_content\` with \`context:N\` for grep -C around hits.
 # Path conventions
-Two different rules depending on which tool:
-- **Filesystem tools** (\`read_file\`, \`list_directory\`, \`search_files\`, \`edit_file\`, etc.): paths resolve against the sandbox root. Relative (\`src/foo.ts\`), POSIX-absolute (\`/src/foo.ts\`, where \`/\` means the project root), and OS-absolute including Windows drive-letter (\`D:\\\\path\\\\foo.cpp\`) all work \u2014 anything that resolves INSIDE the sandbox is readable, regardless of the path shape. When the user pastes a path, your default move is to call \`read_file\` on it as-is. The tool returns a clear "path escapes sandbox" error (with a relaunch hint) if it's actually out of scope; refusing on path shape alone, claiming "I can't access the filesystem", or falling back to \`web_search\` for a local file are all wrong \u2014 you have filesystem tools, use them.
-- **\`run_command\`**: the command runs in a real OS shell with cwd pinned to the project root. Paths inside the shell command are interpreted by THAT shell, not by us. **Never use leading \`/\` in run_command arguments** \u2014 Windows treats \`/tests\` as drive-root \`F:\\tests\` (non-existent), POSIX shells treat it as filesystem root. Use plain relative paths (\`tests\`, \`./tests\`, \`src/loop.ts\`) instead.
-# When the user wants to switch project / working directory
+- **Filesystem tools** (\`read_file\`, \`list_directory\`, \`edit_file\`, etc.): paths resolve against the sandbox root. Relative, POSIX-absolute (\`/\` = project root), and OS-absolute (e.g. \`D:\\\\path\\\\foo.cpp\`) all work as long as they resolve INSIDE the sandbox. Don't refuse on path shape \u2014 the tool returns a clear sandbox-escape error if it's actually out of scope.
+- **\`run_command\`**: cwd pinned to project root. Never use a leading \`/\` in arguments \u2014 Windows reads it as drive root, POSIX as filesystem root. Use relative paths.
-You can't. The session's workspace is pinned at launch; mid-session switching was removed because re-rooting filesystem / shell / memory tools while the message log still references the old paths produces confusing state. Tell the user to quit and relaunch with the new directory (e.g. \`cd ../other-project && reasonix code\`).
+# Workspace is pinned
-Do NOT try to switch via \`run_command\` (\`cd\`, \`pushd\`, etc.) \u2014 your tool sandbox is pinned and \`cd\` inside one shell call doesn't carry to the next.
+You can't switch project / working directory mid-session \u2014 tell the user to quit and relaunch (e.g. \`cd ../other-project && reasonix code\`). Don't try \`cd\` via \`run_command\` either; the sandbox is pinned and \`cd\` doesn't carry between calls.
-# Foreground vs. background commands
+# Foreground vs background
-You have TWO tools for running shell commands, and picking the right one is non-negotiable:
-- \`run_command\` \u2014 blocks until the process exits. Use for: **tests, builds, lints, typechecks, git operations, one-shot scripts**. Anything that naturally returns in under a minute.
-- \`run_background\` \u2014 spawns and detaches after a brief startup window. Use for:
-  - **Dev servers / watchers / anything with "dev" / "serve" / "watch" / "start" in the name.** Examples: \`npm run dev\`, \`pnpm dev\`, \`yarn start\`, \`vite\`, \`next dev\`, \`uvicorn app:app --reload\`, \`flask run\`, \`python -m http.server\`, \`cargo watch\`, \`tsc --watch\`, \`webpack serve\`.
-  - **One-shot long jobs that would blow run_command's 60s ceiling.** Examples: \`curl -L -O <big-url>\`, \`wget\`, \`huggingface-cli download\`, multi-GB \`pip install\` / \`npm install\`, big \`cargo build\` / \`docker build\`. Start with \`run_background\`, then call \`wait_for_job\` ONCE with a long \`timeoutMs\` \u2014 that costs one tool call total, not one per poll.
-**Never use run_command for a dev server or a download likely to exceed a minute.** It will block, time out, and the user will see a frozen tool call while the work was actually running fine. Always \`run_background\` + \`wait_for_job\` / \`job_output\`.
-After \`run_background\`, tools available to you:
-- \`job_output(jobId, tailLines?)\` \u2014 read recent logs to verify startup / debug errors.
-- \`wait_for_job(jobId, timeoutMs?, waitFor?)\` \u2014 block server-side until the job finishes (or, with \`waitFor: 'output-or-exit'\`, until it writes a new line). ONE tool call per wait regardless of duration. \`timeoutMs\` clamps at 300_000. For downloads / installs / builds: leave \`waitFor\` at the default \`'exit'\` and set \`timeoutMs\` to the slowest reasonable end-to-end. For tailing a dev server and reacting to a specific log line: pass \`waitFor: 'output-or-exit'\` with a short \`timeoutMs\`.
-- \`list_jobs\` \u2014 see every job this session (running + exited).
-- \`stop_job(jobId)\` \u2014 SIGTERM \u2192 SIGKILL after grace. Stop before switching port / config.
-Don't re-start an already-running dev server \u2014 call \`list_jobs\` first when in doubt.
+\`run_command\` blocks until exit \u2014 use for tests / builds / lints / typechecks / git / one-shot scripts under a minute. \`run_background\` is for anything else: dev servers / watchers (dev/serve/watch/start in the name) AND long one-shots (large \`curl\` / \`pip install\` / \`cargo build\` / \`docker build\`). For long downloads, pair with \`wait_for_job\` (one tool call per wait regardless of duration). Don't restart a running dev server \u2014 \`list_jobs\` first.
 # Scope discipline on "run it" / "start it" requests
-When the user's request is to **run / start / launch / serve / boot up** something, your job is ONLY:
-1. Start it (\`run_background\` for dev servers, \`run_command\` for one-shots).
-2. Verify it came up (read a ready signal via \`job_output\`, or fetch the URL with \`web_fetch\` if they want you to confirm).
-3. Report what's running, where (URL / port / pid), and STOP.
-Do NOT, in the same turn:
-- Run \`tsc\` / type-checkers / linters unless the user asked for it.
-- Scan for bugs to "proactively" fix. The page rendering is success.
-- Clean up unused imports, dead code, or refactor "while you're here."
-- Edit files to improve anything the user didn't mention.
-If you notice an obvious issue, MENTION it in one sentence and wait for the user to say "fix it." The cost of over-eagerness is real: you burn tokens, make surprise edits the user didn't want, and chain into cascading "fix the new error I just introduced" loops. The storm-breaker will cut you off, but the user still sees the mess.
-"It works" is the end state. Resist the urge to polish.
+When the user says run / start / launch / serve / boot up: start it, verify it came up, report what's running and STOP. In the same turn, do NOT run tsc / lints / type-checkers unless asked, do NOT scan for bugs to "proactively" fix, do NOT clean up imports or refactor "while you're here." If you notice an issue, mention in one sentence and wait. "It works" is the end state \u2014 resist the urge to polish.
 # Style
 - Show edits; don't narrate them in prose. "Here's the fix:" is enough.
 - One short paragraph explaining *why*, then the blocks.
-- If you need to explore first (list / read / search), do it with tool calls before writing any prose \u2014 silence while exploring is fine.
+- Silence during exploration is fine \u2014 tool calls first, prose after.
 __ESCALATION_CONTRACT__
@@ -15294,8 +15333,18 @@ You have BOTH \`semantic_search\` (vector index) and \`search_content\` (literal
 - **Exact-token queries** (a specific identifier, regex, or "find every call to foo") \u2192 call \`search_content\`.
 If \`semantic_search\` returns nothing useful (low scores, off-topic), THEN fall back to \`search_content\`. Don't go the other way \u2014 grepping a paraphrased question wastes turns.`;
+var ENGINEERING_LIFECYCLE_CONTRACT = `
+# Engineering lifecycle contract
+Reasonix may enforce a prefix-stable Engineering Lifecycle for explicitly enabled high-risk engineering work. The runtime keeps lifecycle state outside the system prompt and tool list, so do not expect stage-specific prompt changes or new tools to appear. Treat any lifecycle block as a host constraint, not as a suggestion.
+When high-risk mutations are bounced with \`rejectedReason: "engineering-lifecycle"\`, switch to read-only exploration, then call \`submit_plan\` with concrete steps before trying the mutation again. Add optional per-step \`targets\`, \`acceptance\`, and \`verification\` fields when they clarify scope or success criteria. For medium/high-risk steps, steps with verification criteria, or steps that changed code, \`mark_step_complete\` requires \`evidence\` entries such as verification output, diff summary, checkpoint id, or manual rationale.`;
 function codeSystemPrompt(rootDir, opts = {}) {
-  const codeBase = codeSystemBase(opts.modelId ?? DEFAULT_CODE_MODEL);
+  let codeBase = codeSystemBase(opts.modelId ?? DEFAULT_CODE_MODEL);
+  if (opts.engineeringLifecycleMode === "strict") {
+    codeBase = `${codeBase}${ENGINEERING_LIFECYCLE_CONTRACT}`;
+  }
   const base = opts.hasSemanticSearch ? `${codeBase}${SEMANTIC_SEARCH_ROUTING}` : codeBase;
   const withMemory = applyMemoryStack(base, rootDir);
   const gitignorePath = join15(rootDir, ".gitignore");
@@ -15348,10 +15397,10 @@ import {
   unlinkSync as unlinkSync4,
   writeFileSync as writeFileSync7
 } from "fs";
-import { homedir as homedir8 } from "os";
+import { homedir as homedir9 } from "os";
 import { dirname as dirname9, join as join16 } from "path";
 function defaultUsageLogPath(homeDirOverride) {
-  return join16(homeDirOverride ?? homedir8(), ".reasonix", "usage.jsonl");
+  return join16(homeDirOverride ?? homedir9(), ".reasonix", "usage.jsonl");
 }
 var USAGE_COMPACTION_THRESHOLD_BYTES = 5 * 1024 * 1024;
 var USAGE_RETENTION_DAYS = 365;