npm - @semalt-ai/code - Versions diffs - 1.19.0 → 1.20.0 - Mend

@semalt-ai/code 1.19.0 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81) hide show

package/.claude/settings.local.json +2 -1
package/ARCHITECTURE.md +6 -95
package/CLAUDE.md +196 -1874
package/README.md +1 -1
package/docs/ARCHITECTURE.md +1321 -0
package/docs/CONFIG.md +340 -0
package/docs/HISTORY.md +245 -0
package/index.js +1 -1
package/lib/agent.js +145 -16
package/lib/api.js +28 -3
package/lib/commands/chat-session.js +187 -4
package/lib/commands/chat-slash.js +16 -0
package/lib/commands/chat-turn.js +272 -49
package/lib/commands/chat.js +12 -8
package/lib/config.js +27 -0
package/lib/constants.js +30 -1
package/lib/headless.js +36 -1
package/lib/images.js +8 -2
package/lib/permissions.js +23 -16
package/lib/prompts.js +15 -3
package/lib/tool_registry.js +357 -53
package/lib/tool_specs.js +42 -8
package/lib/tools.js +80 -19
package/lib/ui/anim.js +86 -0
package/lib/ui/ansi.js +17 -27
package/lib/ui/chat-history.js +253 -71
package/lib/ui/create-ui.js +67 -24
package/lib/ui/diff.js +90 -25
package/lib/ui/file-activity.js +236 -0
package/lib/ui/format.js +173 -28
package/lib/ui/input-field.js +5 -4
package/lib/ui/md-stream.js +234 -0
package/lib/ui/render-operation.js +113 -0
package/lib/ui/select.js +1 -4
package/lib/ui/status-bar.js +99 -57
package/lib/ui/stream.js +20 -13
package/lib/ui/theme.js +190 -45
package/lib/ui/tool-operation.js +190 -0
package/lib/ui/utils.js +9 -5
package/lib/ui/web-activity.js +58 -6
package/lib/ui/writer.js +159 -45
package/lib/ui.js +1 -1
package/package.json +1 -1
package/test/anim-driver.test.js +153 -0
package/test/ask-user-display.test.js +226 -0
package/test/ask-user-gate.test.js +231 -0
package/test/chat-history-nocolor.test.js +155 -0
package/test/chat-relogin.test.js +207 -0
package/test/defer-detail-band.test.js +403 -0
package/test/detail-band-tab-flatten.test.js +242 -0
package/test/exec-diff.test.js +268 -0
package/test/executors.test.js +250 -13
package/test/extract-tool-calls.test.js +37 -3
package/test/file-activity.test.js +522 -0
package/test/grep-path-target.test.js +227 -0
package/test/harness/chat-harness.js +2 -1
package/test/headless.test.js +146 -1
package/test/input-field-ctrl-o.test.js +37 -0
package/test/live-height-physical.test.js +281 -0
package/test/max-iterations.test.js +9 -7
package/test/md-stream.test.js +183 -0
package/test/native-dispatch.test.js +53 -0
package/test/native-live-narration.test.js +254 -0
package/test/output-heredoc-leak.test.js +195 -0
package/test/output-preview.test.js +245 -0
package/test/permissions.test.js +199 -0
package/test/read-paginate.test.js +1 -1
package/test/render-operation.test.js +317 -0
package/test/replay-descriptor-xml.test.js +216 -0
package/test/replay-descriptor.test.js +189 -0
package/test/replay-web-aggregate.test.js +291 -0
package/test/replay-web-persist.test.js +241 -0
package/test/running-glyph-anim.test.js +111 -0
package/test/status-bar-driver.test.js +93 -0
package/test/status-bar-resync.test.js +188 -0
package/test/stream-parser.test.js +24 -0
package/test/theme-palette.test.js +166 -0
package/test/truncate-visible.test.js +78 -0
package/test/view-image.test.js +199 -0
package/test/web-activity-ordering.test.js +12 -3
package/path +0 -1

package/docs/HISTORY.md ADDED Viewed

@@ -0,0 +1,245 @@
+# semalt-code — History, Decisions & Rationale
+> Dependency-policy rationale, the full "Key Patterns & Invariants" reference,
+> and the "Deferred / Not Yet Implemented" roadmap. **Not auto-loaded** as project
+> memory. The lean `CLAUDE.md` carries the compressed, verified invariant set;
+> this file preserves the long-form rationale and the per-task history.
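+> Per-task (Task X.Y) rationale and the "Tested by …" enumerations live inline in
+> `docs/ARCHITECTURE.md` alongside each subsystem they describe. A "See **X**
+> above/below" pointer naming a section that does not appear in this file refers to
+> the section of that name in `docs/ARCHITECTURE.md`.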
+---
+## Dependency & Supply-Chain Policy (Task 3.2)
+The project ran **zero runtime dependencies** through Phase 2. Adopting the official
+MCP SDK (`@modelcontextprotocol/sdk`) in v1.9.0 ends that era. The invariant is now
+**minimal, vetted, pinned dependencies** — not "no dependencies."
+**When a runtime dependency is allowed.** Every new runtime dependency must be:
+1. **Minimal** — preferred only when a Node.js built-in genuinely cannot do the job.
+   The bar for the *first* dependency was high on purpose; the bar for the next one
+   is the same. Dev-only tooling is still avoided (we lint with `node --check` and
+   test with `node:test`).
+2. **Justified** — a one-line rationale recorded here (see below) and in the PR.
+3. **Pinned to an exact version** — no `^`/`~`/ranges in `package.json`. Upgrades are
+   deliberate, reviewed commits, never silent on `npm install`.
+4. **Reviewed** — adding/bumping a dependency is a reviewed change, and the
+   regenerated `package-lock.json` is committed in the same PR.
+**Rationale for the web-extraction deps (Task W.1, all pinned exact).** The
+web-fetch pipeline (see **Web Fetch Pipeline** below) turns raw HTML into
+main-content Markdown — reliably parsing real-world malformed HTML, scoring the
+main article over chrome, and emitting clean Markdown are each large, bug-prone
+surfaces where a hand-rolled regex approach is exactly the wrong call (quality is
+the whole point). The chosen libraries are the reference implementations:
+- **`@mozilla/readability` (`0.6.0`)** — Firefox Reader View's extractor; the
+  de-facto standard for "main content of a page." MIT. **Zero transitive deps.**
+- **`turndown` (`7.2.4`)** — the reference HTML→Markdown converter. MIT. One
+  transitive dep (`@mixmark-io/domino`, a DOM impl).
+- **`linkedom` (`0.18.12`)** — a light DOM for Readability to operate on
+  (`jsdom` is far heavier and unnecessary here). MIT. Transitive footprint:
+  `css-select`, `css-what`, `boolbase`, `nth-check`, `domhandler`,
+  `domelementtype`, `domutils`, `dom-serializer`, `entities`, `cssom`,
+  `htmlparser2`, `html-escaper`, `uhyphen` (`canvas` is an *optional* dep, left
+  uninstalled). **Total added: ~18 packages, `npm audit` clean (0 advisories).**
+All three are loaded directly (CommonJS-compatible) from `lib/web-extract.js` —
+no ESM boundary needed (unlike the MCP SDK).
+**Rationale for `@modelcontextprotocol/sdk` (pinned `1.29.0`).** MCP is an open
+protocol with a non-trivial wire contract (JSON-RPC framing, capability negotiation,
+transport lifecycle, schema validation). Reimplementing it by hand would be a large,
+bug-prone surface to own and keep in spec. The **official** SDK is the reference
+implementation, MIT-licensed, and tracks the spec — exactly the case where a vetted
+dependency beats a built-in reimplementation. It is the foundation Task 3.3 builds the
+MCP client on.
+**ESM/CJS boundary.** The SDK is **ESM-only** (`"type": "module"`); this project is
+CommonJS. A CJS module cannot `require()` an ESM-only package. The entire codebase
+stays CommonJS — the SDK is loaded in exactly one place, `lib/mcp/boundary.js`, via
+dynamic `import()`, which re-exposes a CJS-friendly async surface (`loadSdk`,
+`createClient`, `createStdioTransport`). No other module imports the SDK directly.
+See **MCP Boundary** below.
+**Lockfile + CI guardrails.** `package-lock.json` is committed. CI (`.github/workflows/ci.yml`) runs:
+- `npm ci` — installs strictly from the lockfile; fails on package.json↔lockfile drift (integrity).
+- `npm audit --omit=dev --audit-level=high` — fails the build on a **HIGH or CRITICAL**
+  advisory in the **runtime** (production) dependency tree. Dev deps are excluded
+  (there are none today).
+**Audit-findings policy.** When `npm audit` flags an advisory:
+- **Critical / High** → **blocking.** CI fails. Resolve before merge by bumping to a
+  patched pinned version (regenerate + commit the lockfile), or — if no fix exists —
+  removing/replacing the dependency. A temporary, time-boxed exception requires an
+  explicit `npm audit` allow-list entry **with a written justification and a tracking
+  issue**; it is not the default.
+- **Moderate / Low** → **non-blocking** (the `--audit-level=high` gate lets them pass)
+  but **tracked**: open an issue and address on the next dependency-maintenance pass.
+  Do not raise the gate to fail on these without agreement — noisy gates get ignored.
+- **Routine maintenance** → periodically run `npm audit` and `npm outdated`; dependency
+  bumps follow the pinning + review rules above.
+---
+## Key Patterns & Invariants
+- **Minimal, pinned dependencies**: prefer Node.js built-ins; a runtime dependency must be minimal, justified, pinned to an exact version, and reviewed (see **Dependency & Supply-Chain Policy**). Today: `@modelcontextprotocol/sdk` (MCP) and the web-extraction set `@mozilla/readability` + `linkedom` + `turndown` (Task W.1).
+- **CommonJS**: all files use `require()`/`module.exports`. Do not use ES `import`/`export`. The one exception is the **dynamic** `import()` inside `lib/mcp/boundary.js`, which is the sole bridge to the ESM-only MCP SDK — the project itself stays CommonJS.
+- **Streaming**: `api.js` manually parses `text/event-stream`. The parser in `chatStream()` handles partial JSON lines — be careful editing it.
+- **Permissions are per-session**: `PermissionManager` resets on each CLI invocation. Approvals never persist to disk. In non-TTY mode tool calls that would normally need interactive confirmation are **refused** (not auto-approved) unless `--dangerously-skip-permissions` is set, or the tag is pre-approved by an `--allow-*` tier flag.
+- **Destructive-command deny-list** (`lib/deny.js`): every shell call (`exec`/`shell`) passes through `classifyShellCommand()` at the single chokepoint in `agentExecShell`, in *all* modes and regardless of `--allow-*` flags. Handling depends on the **initiator**:
+  - **Agent-initiated** (the model asked, the default): any deny-list hit is a **hard block** — `rm -rf`, `curl … | sh`, disk-wipe/fork-bomb patterns, recursive chmod/chown on a system root, and writes to system paths.
+  - **User-initiated** (a human typed `!cmd` or `semalt-code shell`): the user owns their machine, so a deny-list hit is **not** hard-blocked. The exception is the **catastrophic subset** (`catastrophic: true` — disk-wipe / block-device write, fork bomb), which interposes a single y/N confirmation as a typo guard; all other deny-listed user commands run with a `bypassed` note.
+  - The only full bypass (skips classification entirely) is `--dangerously-skip-permissions`.
+  - **Cross-platform + canonicalized (Task 4.4):** the list now covers the
+    **Windows** destructive set (`del /s`, `rd`/`rmdir /s`, `Remove-Item -Recurse
+    -Force`, `format`, `Format-Volume`, `Clear-Disk`, `cipher /w`, `diskpart …
+    clean`) in addition to POSIX — relevant because native Windows has no OS
+    sandbox. Matching also runs against a **procfs-root-canonicalized** variant
+    (`/proc/self/root` and `/proc/<pid>/root` rewritten to `/`) so a
+    `/proc/self/root/etc/…` bypass is caught by the same system-path matchers
+    (the resolved-path principle, shared with the OS sandbox).
+- **Untrusted web content**: `http_get` runs the **web-fetch pipeline** (Task W.1 / W.1b, `mode` = summarized→extract→Markdown→secondary-LLM summary / extracted→Markdown / raw→original token-capped content) so by default only a compact result enters context (`raw` mode deliberately returns the original markup, still **token-capped**, for page analysis); the result in **every** mode is wrapped in the explicit `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` block (`lib/agent.js`), and the secondary summarizer treats the page as data-only (a page injection could have steered it). The system prompt (`lib/prompts.js`) instructs the model never to act on instructions inside such a block. MCP tool results and **lifecycle-hook output** reuse the same fence. See **Web Fetch Pipeline**.
+- **Lifecycle hooks are deny-listed + sandboxed shell + untrusted output** (`lib/hooks.js`): a `PreToolUse` non-zero exit blocks the tool; every hook command passes through `checkShellDenylist` AND the **OS sandbox** (`resolveSandboxedSpawn`, Pre-Task 5.0a) before running; hook stdout is fenced as untrusted before it reaches the model; timeouts/sandbox-refusals/failures are contained and never crash the loop. **Project-layer command hooks and `verify.command` are quarantined** (`loadHookLayers`/`loadVerifyLayers`): a cloned-repo `.semalt/config.json` can never introduce host-privileged execution, only inert prompt text.
+- **`--readonly` blocks every file-mutating tool** (`READONLY_BLOCKED`, `lib/permissions.js`, completed in Pre-Task 5.0c): `write_file`, `append_file`, `edit_file`, `replace_in_file`, `delete_file`, `make_dir`, `remove_dir`, `move_file`, `copy_file`, `upload`, `download`. The block is enforced at the executor (`permissionManager.readonlyBlock(tag)`), so it holds for both the XML and native paths; `describePermission` also short-circuits the gate (no approval prompt precedes the deterministic block). **Scope decision (load-bearing): `--readonly` governs FILE TOOLS only.** Shell (`exec`/`shell`) is **not** in the set — a read-only session must still run read-only commands (`ls`, `git status`), and a shell command's arbitrary write side effects are the **OS sandbox + deny-list's** job to confine (the right layer post-Pre-Task 5.0a), not `--readonly`. So `--readonly` is an honest "no file-tool writes," not a false "no writes at all." Read-only file tools (`read_file`, `grep`, `glob`, `search_in_file`, `file_stat`, `list_dir`) work unchanged. Tested by `test/readonly-tools.test.js`.
+- **Secret-file read guard**: `isProtectedSecretPath()` in `tools.js` refuses reads/copies/moves of `config.json`, `memory.json`, and `audit.log` via file tools — **not** overridable by `--allow-anywhere` (only by `--dangerously-skip-permissions`).
+- **Config-write guard** (`isProtectedConfigPath()` in `tools.js`, Pre-Task 5.0b): the write-side companion to the read guard. Every write executor (`write_file`, `append_file`, `edit_file`, `replace_in_file`, `move_file`/`copy_file` **dst**, `upload`, `download`) refuses to write into the **protected-config set** — the whole `~/.semalt-ai` dir **and** every project `.semalt` dir from the CWD up to the repo root, **including files that do not yet exist** (directory-prefix matched on the resolved path, so a missing `.semalt/config.json`/`agents/*.md`/hook is covered). The set is defined once as `protectedConfigDirs` (`lib/constants.js`) and shared with the OS sandbox's `protectedPaths`. Same bypass policy as the read guard: **not** overridable by `--allow-anywhere`, only by `--dangerously-skip-permissions` (human-only). This guards the **agent's** file tools and the sandboxed shell — a human editing their own config in an editor is unaffected. Tested by `test/config-write-guard*.test.js`, `test/path-guards.test.js`, and the kernel case in `test/sandbox-integration.test.js`.
+- **Per-pattern permission rules** (`lib/permission-rules.js`, Task 4.1): allow/deny/ask rules matching tool + argument (glob/regex), layered user→project. **Project rules can only NARROW** — every project `allow` is structurally dropped before resolution, so a cloned-repo `.semalt/config.json` can never widen the user posture. Precedence is total/deterministic (deny>ask>allow, most-specific then most-restrictive). Arguments are canonicalized (`..`/symlink/abs-rel) before matching; pathological/malformed rules fail closed; an `allow` never bypasses the deny-list, secret guard, `--readonly`, or `isPathSafe` (those stay in the executors). A `deny` rule holds even under `--dangerously-skip-permissions`. See **Per-Pattern Permissions** above.
+- **Checkpoints & rewind** (`lib/checkpoints.js`, Task 4.3 / 4.3b): before each file-tool mutation the file's prior state is snapshotted (post-gate, pre-mutation, in `agentExecFile`) so `/rewind` can restore it — **file-tool changes only; shell side effects are not reversible.** Capture is fail-safe (a snapshot failure never blocks the mutation); a denied/withheld call produces no checkpoint; subagent mutations are checkpointed into the parent session. Delete/move are reversed explicitly; an external-modification check warns/asks before clobbering out-of-band edits. A per-file size cap and per-session retention are enforced. **Rewind is human-only (no rewind tool in the registry).** Task 4.3b: the restore path **re-validates the current guards** (`isPathSafe`/secret/protected-config/`deny` rule) per target — a now-forbidden path is refused/skipped, and `force` overrides only the external-mod check, not the guards; **three restore modes** `code`/`conversation`/`both` (default both) restore files, history, or the linked state, with conversation truncation cutting on **turn boundaries** (no orphaned `tool_call`; discard policy) — all on the **unchanged** on-disk schema. See **Checkpoints & Rewind** above.
+- **Native git tools** (`lib/tool_registry.js`, Task 5.1): eight first-class git tools shelling out through the **same** `agentExecShell` sandbox + deny-list chokepoint as `<shell>` (no privileged path around confinement), parsing output into structured results. Read-only (`git_status`/`git_diff`/`git_log`, plus the *list* ops of `git_branch`/`git_worktree`) return a null permission descriptor; mutating (`git_add`/`git_commit`/`git_branch`/`git_checkout`/`git_worktree` add/remove) require approval, honor `--readonly`, and pass the per-pattern rules. `git_commit` requires a real non-empty message (empty → error, never a placeholder). **Destructive-git ↔ checkpoint honesty:** git operations are NOT reversible via `/rewind` (checkpoints snapshot file-tool mutations only) — stated in the descriptions and prompt text. Not-a-repo / git-absent degrade gracefully. See **Native Git Tools** above.
+- **API-key sourcing** (`lib/secrets.js`): precedence is `SEMALT_API_KEY` env → OS keychain (macOS `security` / Linux `secret-tool` / Windows PasswordVault) → `config.json`. Keys from env/keychain are never written back to config; `configShow` reports only `api_key_source`. Store a key with `semalt-code auth set-key`.
+- **Token counting is approximate**: `estimateTokens()` divides char count by 4. It is used only for the `/compact` display — do not rely on it for hard limits.
+- **Context trimming is proactive when a limit is known**: `chatStream()` uses the in-process `_sessionInputLimits` learned from a prior 400 overflow first, then falls back to `config.context_length * 0.9`. When neither is set, no pre-flight trim runs and the client relies on the reactive 400/413 handler (which then persists the discovered window). `Metrics.tokenLimitStatus()` returns `{ used, limit: null }` until a limit is learned, so the status bar shows "N tok · limit unknown" instead of hiding the line.
+- **Shell/exec output entering context is bounded** (Task W.6, `capShellOutput` in `lib/agent.js`): the model-facing shell result is double-bounded — a **head+tail line cap** (`max_output_lines`, default 50, split first ~60% + last ~40% via `OUTPUT_HEAD_RATIO`) eliding the middle, **then** a **token safety net** (`max_output_tokens`, default 10000, reusing the web pipeline's `capToTokens`) so a few enormous lines (minified JS, a binary `cat`) can't blow context. The elision notice teaches the W.5-enabled redirect-to-file→grep pattern. **The exit code stays on its own line, so truncating output VOLUME never hides the command's OUTCOME** (a non-zero exit / failure is always surfaced). Applied at the context boundary in the agent loop — distinct from the **UI** cap (`lib/ui/diff.js`, display only), which stays. Before W.6 the cap was UI-only and the model received the **entire** unbounded stdout+stderr (the #1 context risk). Pure helper, unit-tested on the model-facing text + a real-loop assertion (`test/shell-output-cap.test.js`). MCP/subagent output bounding is Task W.8 (below); W.9 unifies all the paths into a shared chokepoint.
+- **MCP & subagent results entering context are bounded** (Task W.8, `formatMcpResult`/`formatSubagentResult` in `lib/agent.js`): the last two unbounded paths. Both apply `capToTokens` (the W.5–W.7 standard) to the result text **before** wrapping it in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence, with **distinct budgets reflecting their nature**: **MCP is stricter** (`mcp.max_result_tokens`, default **10000**) because the payload size is third-party/server-controlled and untrusted — the riskiest path; **subagent is generous** (`subagents.max_result_tokens`, default **20000**) because the child's final text is our own deliberate, synthesized answer (a safety net against a verbose child). For MCP the truncation notice sits **inside** the fence with the capped content — capping never weakens the untrusted perimeter; subagent isolation / no-escalation (3.6/4.5) are unchanged (this bounds returned-text size only). A small result passes through fully, no notice. Pure helpers, unit-tested on the model-facing/parent-facing text incl. the fence-still-present and budgets-differ cases + real-loop assertions (`test/result-cap.test.js`).
+- **`read_file` is paginated** (Task W.7, `formatReadResult` in `lib/agent.js`): `read_file` used to dump the **whole file verbatim** into context (`File <path>:\n` + the entire content); the only guard was a hard byte refusal at `max_file_size_kb`. Worst case ~128k tokens for a 500 KB file. Now the **model-facing** result is paginated, mirroring the Claude Code standard: under a **line cap** (`read_line_cap`, default **2000**) the file reads **byte-for-byte as before** (no regression for the common small-file case); over the cap it returns the first page + a **`[PARTIAL]` notice** — `Showing lines 1–2000 of 5234. Read more with start_line=2001.` **`start_line`/`end_line`** (on both XML + native rails; absent → null, tuple parity) read an explicit slice, **also line-capped** so a huge explicit range can't dump everything. A **token safety net** (`read_max_tokens`, default **25000**, reusing the web pipeline's `capToTokens`) bounds the pathological few-but-enormous-lines case (one 100 KB minified line) the line cap misses — consistent with W.6's double-bound. The bound is applied at the **context boundary** in the formatter (the executor still returns the full content, like W.5/W.6); pagination — not the byte cap — is the primary bound, so `max_file_size_kb` is now a **backstop** (raised default **50 MB**) ruling out a multi-GB whole-file slurp (lower it to hard-refuse smaller files). **Line numbers are OPTIONAL, default OFF** (`show_line_numbers`): the **Step 0 finding** is that `edit_file` is **line-number-based** (`lines[N-1]=content`) while `replace_in_file` is **match-based** (regex on a search string) — a mix — so always-on numbers would corrupt copyable snippets for the match path **and** cost ~1.7× per read; the param turns absolute 1-based numbers on (aligned with `edit_file`'s addressing) for when the agent wants line refs to drive an edit. Line indexing matches `edit_file`'s `split('\n')` exactly, so the read→edit loop stays aligned. 
+  Pure helper, unit-tested on the model-facing text incl. the no-regression small-file case + the PARTIAL large-file case + rail parity + read→edit alignment (`test/read-paginate.test.js`).
+- **grep/glob results are serialized + bounded** (Task W.5, `formatGrepResult`/`formatGlobResult` in `lib/agent.js`): `formatFileResult` now has `case 'grep'`/`case 'glob'` that turn the structured engine result into model-facing text — closing a correctness bug where both fell through the default and the model received `"grep: done"`/`"glob: done"` (the data was computed and even shown in the UI, but never entered context, making grep-first navigation impossible). grep `output_mode` (`content`/`files_with_matches`/`count`) is model-selectable via the spec; `head_limit` (default `DEFAULT_GREP_HEAD_LIMIT`/`DEFAULT_GLOB_HEAD_LIMIT` = 100) + optional `offset` bound what reaches the model — the engine's 1000/5000 internal caps were never a context bound (the result was dropped before it reached context). Over-limit serialization carries a truncation notice telling the agent how to narrow (refine the pattern, switch to `count`/`files_with_matches`, or raise `head_limit`); under-limit results show fully with no notice. The executors (`lib/tool_registry.js`) normalize and attach `output_mode`/`head_limit`/`offset` onto the result; the serializers are pure and tested on the **model-facing** text (`test/grep-glob-serialize.test.js`, incl. the real-loop regression).
+- **Tool output enters context ONLY via the `boundToolOutput` chokepoint** (Task W.9, `lib/agent.js`): the size analogue of the `resolveSandboxedSpawn` sandbox chokepoint. W.5–W.8 each bounded a previously-unbounded path, but the `capToTokens`-+-fence step was duplicated ad-hoc in five places — the original bugs (grep/glob `"done"`, shell/MCP/subagent unbounded) were all the **same class**: a path that put output into context without bounding it. `boundToolOutput(text, { budget, notice, fenced })` is the **single application point**: it applies `capToTokens` with the path's **budget** and **notice** function and (when `fenced`) wraps in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence. **grep/glob, shell, read_file, MCP, subagent — and http_get/web_search — all route through it.** The per-path policy is **deliberately distinct and NOT flattened**: budgets (MCP 10k < subagent 20k < read 25k; shell 10k; grep/glob `DEFAULT_GREP_GLOB_MAX_TOKENS` 10k — a new token net so a few huge minified match lines can't blow context, the W.6 lesson applied to grep's count-bound), notice wording (shell teaches redirect→grep, read teaches narrow-the-range, …), and the fence flag (MCP/subagent/web fenced; file/shell not). **Refactor-safe:** model-facing outputs are byte-identical to W.5–W.8 (the W.5–W.8 test suites pass unchanged); http_get/web_search bodies are already token-capped upstream so they pass **no budget** (fence only). **Structural regression prevention:** a new tool gets bounding by *routing* its output through the chokepoint, not by *remembering* to cap. Pure helper, unit-tested on the chokepoint behavior, per-path policy, the bound-by-construction invariant, and equivalence (`test/output-chokepoint.test.js`). 
+  The system prompt's `LOCAL_NAVIGATION_NOTICE` (`lib/prompts.js`, both templates) — now actionable post-W.5 — steers the grep-first / read-slice pattern: locate with `grep`/`glob` (`count`/`files_with_matches` modes), then `read_file` only the relevant `start_line`/`end_line` slice; redirect large command output to a file and grep it.
+- **Bounded agent iterations**: the primary loop caps at `config.max_iterations` (default 125, via `DEFAULT_MAX_ITERATIONS` in `constants.js`), overridable with `--max-iterations <n>`; `--max-iterations 0`/`"unlimited"` removes the cap deliberately. Reaching the cap stops gracefully (clear message + `stopReason: "max_iterations"`), never silently. Subagents have their own cap of 12.
+- **Malformed tags are skipped**: each tool dispatch in the agent loop is wrapped in try/catch; on an error the loop emits a warning line and continues to the next tool call.
+---
+## Deferred / Not Yet Implemented
+This section exists because false documentation has burned this project before (a
+"max 10 iterations" invariant that never existed; coverage assumed but absent). The
+items below are things a reader might reasonably expect from the docs or from peer
+tools but that the code **does not do today**. They are listed honestly so nobody
+builds on a feature that isn't there. Each is marked **Planned (Phase 4+)** (on the
+roadmap), **Out of scope** (no current plan), or — once it has shipped — **✅ Done**.
+**Gaps the re-audit found in existing behavior:**
+- **MCP in headless / one-shot** — *Planned (Phase 4+).* `connectAll()` runs only in
+  interactive `cmdChat` (and the `mcp` management commands); `code`/`edit`/`shell`/`-p`
+  never connect a manager, so MCP tools are unavailable there. See **MCP Client → Scope**.
+- **Session auto-resume** — *Planned (Phase 4+).* Sessions are saved, but there is no
+  startup prompt offering to resume the most recent (< 24 h) session. Resume is always
+  explicit: `/history` (local) or `--resume <id>` (dashboard). See **Session Storage**.
+- **Corporate-proxy consumption** — *Planned (Phase 4+).* `HTTPS_PROXY`/`HTTP_PROXY`
+  are parsed into config but `api.js` does not route requests through a proxy agent,
+  so they have no effect on outbound HTTP. See **Config hierarchy → Environment**.
+**Phase 4 roadmap (originally Planned, in the stated order; all four items are now Done):**
+- **Per-pattern permissions** — ✅ **Done (Task 4.1).** Rich allow/deny/ask rules
+  matching tool + argument (glob/regex), layered user→project. See **Per-Pattern
+  Permissions** above.
+- **Self-verification** — ✅ **Done (Task 4.2).** When the agent declares done,
+  optionally run a configured verify command (advisory feeds the result back;
+  enforcing returns the agent to the loop until verify passes, bounded by
+  `max_attempts` → `verify_failed`). See **Self-Verification** above.
+- **Checkpoints / rewind** — ✅ **Done (Task 4.3 file half + Task 4.3b
+  conversation + restore re-validation).** Per-write file snapshots before each
+  file-tool mutation; `/rewind` restores prior content (last or to a chosen
+  sequence), with delete/move handled and an external-modification check that never
+  silently clobbers out-of-band edits. **File-tool changes only — shell side
+  effects are not reversible.** Task 4.3b closed the last deferred 4.3 security
+  finding (the restore path now **re-validates the current
+  isPathSafe/secret/protected-config/`deny`-rule guards** per target — `force`
+  overrides only the external-mod check) and added **three restore modes**
+  (`code`/`conversation`/`both`, default both) using the existing turn-linkage,
+  with conversation truncation cutting on **turn boundaries** (no orphaned
+  `tool_call`; discard policy) on the **unchanged** on-disk schema. Rewind stays
+  **human-only** (no rewind tool registered). See **Checkpoints & Rewind** above.
+- **OS sandbox** — ✅ **Done (Task 4.4 filesystem + Task 4.4b network).** Real
+  OS-level confinement for shell commands: Seatbelt (macOS) / bubblewrap
+  (Linux/WSL2) jail every command and its children, confining writes to the working
+  dir and keeping `~/.semalt-ai`/secrets/`/etc` read-only (incl. not-yet-existing
+  files), with a fail-safe ask-or-block fallback when the primitive is absent and no
+  model-reachable way to disable it. **Network isolation is now done as well —
+  binary on/off** (bwrap `--unshare-net` / Seatbelt `(deny network*)`), no host
+  proxy / no domain allowlist / no TLS interception, anti-fail-open default. See
+  **OS Sandbox** above.
+**Done since:**
+- **Native git tooling** — ✅ **Done (Task 5.1).** Eight first-class git tools
+  (`git_status`/`git_diff`/`git_log` read-only; `git_add`/`git_commit`/`git_branch`/
+  `git_checkout` mutating; `git_worktree` infrastructure) shelling out through the
+  sandbox + deny-list chokepoint with structured results. The long tail stays in the
+  generic shell. See **Native Git Tools** above.
+- **Embedding SDK** — ✅ **Done (Task 5.2).** Two-tier library surface separated by
+  `package.json` `exports`: the stable `createAgent` facade (main entry) and the
+  unstable building blocks (`/internals`). Programmatic permission policy that
+  defaults to refusing mutations; sandbox/deny-list stay on with explicit opt-out;
+  `close()` teardown; per-instance config (process-global limits documented). See
+  **Embedding SDK** above.
+- **Background tasks** — ✅ **Done (Task 5.3).** `run --background` launches a
+  detached agent process (own process = own global state, reusing the
+  `createAgent` facade) with a launch-fixed, refuse-by-default policy and
+  sandbox/deny-list on; a file-based task registry (`~/.semalt-ai/tasks/`) drives
+  `tasks list|status|result|kill|prune`. Validation runs before detach (no
+  orphans); stale/dead tasks are detectable and prunable; kill tree-kills by PID.
+  Background-launch is intentionally NOT an agent tool. See **Background Tasks**
+  above.
+- **Multimodal image input** — ✅ **Done (Task 5.4).** PNG/JPEG/WebP/GIF attach via
+  `--image` (repeatable), in-chat `/image`, and the SDK `images` option; read
+  through `isPathSafe`, size-capped (`image_max_bytes`), base64-encoded, media
+  type detected from magic bytes. The provider content-part shape (Anthropic-style
+  vs OpenAI-style) is selected per profile/heuristic; a text-only model fails loud
+  (the image is never silently dropped). PDF input deferred; generation out of
+  scope. See **Multimodal Image Input** above.
+**Planned, not yet scheduled:**
+- **Cost caps** — hard spend limits per session/turn (today cost is *displayed* via
+  `lib/pricing.js`, never enforced).
+- **Auto-update** — self-updating the CLI (today: `npm install -g` manually).
+- **XDG / `%APPDATA%` config dirs** — honoring platform config-dir conventions instead
+  of the fixed `~/.semalt-ai/`.
+- **Domain-allowlist network policy** — *deliberately deferred, may stay out of
+  scope.* Task 4.4b ships **binary** network isolation (on / kernel-level none); a
+  per-domain allowlist ("allow github.com, block the rest") is **not** implemented
+  and is **not** a planned increment by default. **Rationale:** domain-granularity
+  requires a host-side egress proxy with full network privileges, which is the
+  exact design the reference implementation shipped and that was **bypassed
+  completely, twice, over 5.5 months** (allowedDomains fail-open CVE-2025-66479, a
+  hostname-parser differential, and TLS-MITM breaking Go binaries). We will only
+  revisit this if it can be done **without** a host proxy / TLS interception (e.g.
+  a kernel/eBPF egress filter on resolved IPs) — until then, binary isolation is
+  the robust posture. See **OS Sandbox → Why binary**.
+- **Native-Windows / WSL1 sandbox** — no OS primitive today (bwrap needs the
+  user/mount namespaces WSL1 lacks; native Windows has none). On those platforms
+  the sandbox degrades to the fail-safe fallback (ask-or-block); the Windows
+  deny-list (now covered, Task 4.4) is the remaining shell guard there.
+**Out of scope (no current plan):**
+- **Multimodal — image *input*** is ✅ **Done (Task 5.4)** — PNG/JPEG/WebP/GIF
+  attached via `--image` / `/image` / the SDK `images` option, sent provider-
+  specifically to vision models (text-only models fail loud). See **Multimodal
+  Image Input** above. Still out of scope: **PDF input** (deferred), **audio
+  input**, and **image/audio *generation* / output**.
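+- **Background / cloud / scheduling** — cloud execution and cron-style scheduling of
+  long-running agents. Local detached background tasks (`run --background`) are
+  ✅ **Done (Task 5.3)** — see **Done since** above.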
+- **OpenTelemetry** — OTel traces/metrics export.
+- **Managed policy** — centrally-administered org policy enforcement.
+- **Native notifications** — OS-level desktop notifications.
+---

package/index.js CHANGED Viewed

@@ -289,7 +289,7 @@ Options:
                           no host proxy, no domain allowlist, no TLS interception.
                           Same effect as sandbox.network "off" in config.
   --readonly              Block all write operations
-  --max-iterations <n>    Cap agent-loop iterations per turn (default 50);
+  --max-iterations <n>    Cap agent-loop iterations per turn (default 125);
                           0 or "unlimited" removes the cap (power-user choice)
   --no-verify             Skip self-verification (config.verify) for this run
   --dangerously-skip-permissions

package/lib/agent.js CHANGED Viewed

@@ -3,7 +3,7 @@
 const { logToolCall } = require('./audit');
 const { Metrics } = require('./metrics');
 const { getSystemPrompt, getPlanModeNotice } = require('./prompts');
-const { isNativeToolsActive } = require('./config');
+const { isNativeToolsActive, getInlineReasoning } = require('./config');
 const { TAG_REGISTRY, DEFAULT_MAX_ITERATIONS, DEFAULT_GREP_HEAD_LIMIT, DEFAULT_GLOB_HEAD_LIMIT, DEFAULT_GREP_GLOB_MAX_TOKENS, DEFAULT_MAX_OUTPUT_LINES, OUTPUT_HEAD_RATIO, DEFAULT_OUTPUT_MAX_TOKENS, DEFAULT_READ_LINE_CAP, DEFAULT_READ_MAX_TOKENS, DEFAULT_MCP_MAX_RESULT_TOKENS, DEFAULT_SUBAGENT_MAX_RESULT_TOKENS } = require('./constants');
 const { capToTokens, defaultEstimate, DEFAULT_CHARS_PER_TOKEN } = require('./web-extract');
 const { mapInvokeToCall } = require('./tools');
@@ -342,6 +342,19 @@ function truncateForDebug(text, maxLines = 40, maxChars = 2000) {
 // layer (commands.js) feeds the meta into formatToolLine together with
 // the tag, so the formatter can produce the 4-segment line in either the
 // pending (live region) or final (scrollback) context.
+// Phase 6a — build one native `{role:'tool'}` result message. `content` is the
+// model-facing bound result string, kept BYTE-IDENTICAL (Inv. 1). A serialized
+// display descriptor core (from onToolEnd), when present, rides along as a
+// sibling `_display` key — additive only, never part of `content`, and stripped
+// before the wire (see api.js) so it is never fed to the model. Replay
+// (chat-history.js) reads `_display` to render with full fidelity; its absence
+// falls back to the legacy summary.
+function _nativeToolMessage(toolCallId, content, displayCore) {
+  const msg = { role: 'tool', tool_call_id: toolCallId, content };
+  if (displayCore) msg._display = displayCore;
+  return msg;
+}
 function _metaForTool(tag, result) {
   if (!result || result.error) return null;
   switch (tag) {
@@ -383,6 +396,11 @@ function _metaForTool(tag, result) {
         bytes: result.size_kb ? Math.round(parseFloat(result.size_kb) * 1024) : 0,
         kind: result.type || null,
       };
+    case 'ask_user':
+      // Surface the user's chosen answer as display meta so the committed result
+      // line reads "✓ user · ask <question> · → <answer>". Display-only: the
+      // model-facing string (formatFileResult) still uses the full question.
+      return { answer: result.answer };
     default:
       return null;
   }
@@ -402,6 +420,7 @@ function _attrsFromCall(call) {
       return { command: args[0] || '' };
     case 'read':
     case 'read_file':
+    case 'view_image':
     case 'list_dir':
     case 'delete_file':
     case 'make_dir':
@@ -819,6 +838,15 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
           maxTokens: cfg.read_max_tokens,
         });
       }
+      case 'view_image':
+        // The encoded image rides on result.image and is attached to this turn's
+        // tool-result message by the loop below; the model-facing text is just a
+        // short confirmation. Wording is deliberate: the image is visible to the
+        // MODEL for analysis, NOT shown to the user — so the model must not refer
+        // to it as something the user can see.
+        return `Image ${result.path} (${result.media_type}, ${result.bytes} bytes) is now attached to your `
+          + `vision context — analyze it directly. It was made visible to YOU (the model) for analysis; it was `
+          + `NOT displayed to the user, so do not refer to it as something the user can see.`;
       case 'write':
         return `Wrote ${result.bytes} bytes to ${args[0]}`;
       case 'append':
@@ -982,6 +1010,11 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
     };
     const nativeTools = isNativeToolsActive(model);
+    // Live-narration safety signal (b): an explicit per-profile assertion that
+    // this model does NOT inline reasoning into delta.content. Only an explicit
+    // `false` is the eager-stream signal; undefined/true keep the safe buffered
+    // fallback. Threaded to the UI gate via onStreamStart alongside nativeTools.
+    const inlineReasoning = getInlineReasoning(model);
     // Checkpoint turn linkage (Task 4.3): tag every checkpoint captured during
     // this turn with the conversation point that produced it, so a future
@@ -1077,11 +1110,19 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
         ? (token) => {
             if (!streamStarted) {
               streamStarted = true;
-              if (cb.onStreamStart) cb.onStreamStart();
+              // Pass the rail + inline-reasoning assertion so the UI gate can
+              // decide whether it is safe to eager-open live narration on the
+              // native rail. The XML rail (nativeTools false) ignores both.
+              if (cb.onStreamStart) cb.onStreamStart(nativeTools, inlineReasoning);
             }
             parser.push(token);
           }
         : null;
+      // Live-narration safety signal (a): surface the first reasoning_content
+      // delta to the UI so it can eager-open the gate before content arrives.
+      const wrappedOnReasoning = cb.onReasoningStart
+        ? () => { cb.onReasoningStart(); }
+        : null;
       const MAX_RETRIES = 3;
       const RETRYABLE_STATUS = new Set([408, 425, 429, 500, 502, 503, 504]);
@@ -1108,6 +1149,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
               linePrefix: wrappedOnToken ? '' : linePrefix,
               showThink,
               onToken: wrappedOnToken,
+              onReasoning: wrappedOnReasoning,
               silent: !!wrappedOnToken,
               signal: controller.signal,
               onTrim: (info) => {
@@ -1312,7 +1354,19 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
           }
         }
       } else {
-        toolCalls = extractToolCalls(reply, { model });
+        // No structured native tool_calls this turn. Parse the text for tool
+        // calls — but on the NATIVE rail, suppress the soft TEXT HEURISTICS that
+        // infer commands from untagged prose (the bare ```bash/```sh/```shell
+        // fence pass). On the native rail a finish_reason=stop turn is usually a
+        // plain text final answer, and an illustrative ```bash block in that
+        // narration must NEVER be executed (the incident: a hung `su nobody` and
+        // two placeholder examples were run). EXPLICIT tool-tag dispatch
+        // (<exec>/<shell>/<write_file>/<minimax:tool_call>/<function=…>/MCP tags)
+        // is deliberate and unambiguous, so it stays active on BOTH rails — the
+        // native rail legitimately dispatches tools via those tags too. The XML
+        // rail keeps every heuristic (byte-identical to before): it has no
+        // structured channel, so the fence pass is part of its contract.
+        toolCalls = extractToolCalls(reply, { model, skipTextHeuristics: nativeTools });
       }
       const isNativeCall = nativeToolCalls.length > 0;
       const cleanedReply = cleanAssistantContent(reply);
@@ -1442,10 +1496,21 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
         assistantMsg.tool_calls = nativeToolCalls.filter((tc) => acceptedSet.has(tc.id));
       }
       messages.push(assistantMsg);
-      // When showThink is off and the turn has tool calls, suppress the text bubble —
-      // pre-tool reasoning is noise, tool result bubbles already convey what happened.
-      const displayReply = (!showThink && toolCalls.length > 0) ? '' : cleanedReply;
-      if (cb.onAssistantMessage) cb.onAssistantMessage(displayReply);
+      // Live narration (Claude-Code style): stream the model's pre-tool "what I'm
+      // about to do" text instead of blanking it when tools are present. `cleanedReply`
+      // has already had ALL reasoning stripped by cleanAssistantContent — the implicit
+      // </think> preamble (Qwen3-style) and any <think>/<reasoning>/<reflection>/<plan>
+      // blocks, plus the tool tags — so no hidden reasoning leaks into the bubble or
+      // persisted history. The implicit-think gate in chat-turn.js is the live-stream
+      // safety net for the token-by-token path; here we simply stop forcing the
+      // post-turn text to '' just because the iteration carried a tool call.
+      const displayReply = cleanedReply;
+      // `terminal` lets the UI distinguish a final answer from an intermediate
+      // tool-call iteration. Previously the UI used "content is empty" as that proxy
+      // (blanked tool iterations passed ''); now that intermediate iterations also
+      // carry narration, the proxy is gone — pass the real signal so web-activity
+      // collapse (which must only flush on the terminal answer) stays correct.
+      if (cb.onAssistantMessage) cb.onAssistantMessage(displayReply, { terminal: toolCalls.length === 0 });
       if (toolCalls.length === 0) {
         // Native mode: tool_calls came in but none could be converted (parse
@@ -1567,6 +1632,17 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
       }
       const results = [];
+      // view_image staging: encoded image records returned by view_image executors
+      // this turn. Collected here and attached to the tool-result message's
+      // `images[]` below, so api.js buildProviderMessages turns them into provider
+      // vision blocks on the NEXT model turn (the same wire path /image uses).
+      const stagedImages = [];
+      // Phase 6a — serialized display descriptor cores, pushed in LOCKSTEP with
+      // `results` (one entry per result, null when there is no descriptor — e.g.
+      // a denied/withheld/hook-blocked call never reaches onToolEnd). Since
+      // results[i] ↔ nativeToolCallIds[i] ↔ toolCalls[i], displayCores[i] aligns
+      // with the native tool message pushed below and rides along as `_display`.
+      const displayCores = [];
       const debugEntries = debug ? [] : null;
       let aborted = false;
@@ -1621,6 +1697,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
               if (cb.onError) cb.onError({ message: `PreToolUse hook blocked ${tag}.`, isWarning: true });
               logToolCall(tag, { args: call.slice(1) }, false, 'hook-blocked');
               results.push(resultStr);
+              displayCores.push(null);
               if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'hook_blocked', exitCode: null, result: resultStr });
               continue;
             }
@@ -1663,6 +1740,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
             if (cb.onError) cb.onError({ message: `Permission rule denied ${tag} (${ruleVerdict.reason}).`, isWarning: true });
             logToolCall((permDesc && permDesc.tag) || tag, { args: call.slice(1) }, false, `rule-denied:${ruleVerdict.reason}`);
             results.push(resultStr);
+            displayCores.push(null);
             if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'rule_denied', exitCode: null, result: resultStr, rule: ruleVerdict.reason });
             continue;
           }
@@ -1680,6 +1758,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
             if (cb.onPlanWithhold) cb.onPlanWithhold(tag, arg, permDesc);
             logToolCall(permDesc.tag || tag, { args: call.slice(1) }, false, 'withheld');
             results.push(resultStr);
+            displayCores.push(null);
             if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'withheld', exitCode: null, result: resultStr });
             continue;
           }
@@ -1707,6 +1786,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
                 : `${tag} ${arg}: Permission denied by user.${reasonSuffix}`;
               logToolCall(permTag, { args: call.slice(1) }, false, 'denied');
               results.push(resultStr);
+              displayCores.push(null);
               if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'denied', exitCode: null, result: resultStr, rule: ruleVerdict.reason || undefined });
               aborted = true;
               break;
@@ -1733,8 +1813,9 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
                 const oneLine = String(arg).replace(/\s+/g, ' ').trim();
                 const truncatedCmd = oneLine.length > 80 ? oneLine.slice(0, 77) + '...' : oneLine;
                 const resultStr = `User interrupted execution after ${elapsedS}s. Tool was running: ${truncatedCmd}. Plan around this — do not retry the same long-running command.`;
-                if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } });
+                const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } }) : null;
                 results.push(resultStr);
+                displayCores.push(displayCore || null);
                 if (debugEntries) debugEntries.push({ tag, call, ms, status: 'aborted', exitCode: null, result: resultStr });
                 aborted = true;
                 break;
@@ -1754,8 +1835,9 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
                 const error = shellResult.exit_code !== 0
                   ? { message: `exit ${shellResult.exit_code}`, code: shellResult.exit_code }
                   : null;
-                if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error });
+                const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error }) : null;
                 results.push(await augmentWithHooks(tag, attrs, resultStr, preFeedback));
+                displayCores.push(displayCore || null);
                 if (debugEntries) debugEntries.push({
                   tag,
                   call,
@@ -1783,19 +1865,35 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
               const oneLine = String(arg).replace(/\s+/g, ' ').trim();
               const truncatedArg = oneLine.length > 80 ? oneLine.slice(0, 77) + '...' : oneLine;
               const resultStr = `User interrupted execution after ${elapsedS}s. Tool was running: ${tag} ${truncatedArg}. Plan around this — do not retry the same long-running operation.`;
-              if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } });
+              const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } }) : null;
               results.push(resultStr);
+              displayCores.push(displayCore || null);
               if (debugEntries) debugEntries.push({ tag, call, ms, status: 'aborted', exitCode: null, result: resultStr });
               aborted = true;
               break;
             } else {
               const resultStr = formatFileResult(call, fileResult);
+              // view_image: stage the encoded image so it attaches to this turn's
+              // tool-result message (below) and reaches the model as a vision block
+              // next turn — same mechanism /image uses, no parallel encoder.
+              if (fileResult && fileResult.image && typeof fileResult.image.data === 'string') {
+                stagedImages.push(fileResult.image);
+              }
               const meta = _metaForTool(tag, fileResult);
               const error = fileResult.error
                 ? { message: fileResult.error, code: fileResult.error_code || null }
                 : null;
-              if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error });
+              // File-edit diff payload (execution-time rendering). Mutating file
+              // tools attach _diffBefore/_diffAfter; hand them to onToolEnd so the
+              // UI renders the diff for EVERY edit, independent of the permission
+              // modal or approval state. Absent on non-mutating/loaded calls → null.
+              const diff = (fileResult && typeof fileResult._diffBefore === 'string'
+                && typeof fileResult._diffAfter === 'string')
+                ? { before: fileResult._diffBefore, after: fileResult._diffAfter, path: fileResult.path || call[1] }
+                : null;
+              const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error, diff }) : null;
               results.push(await augmentWithHooks(tag, attrs, resultStr, preFeedback));
+              displayCores.push(displayCore || null);
               if (debugEntries) debugEntries.push({
                 tag,
                 call,
@@ -1807,7 +1905,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
             }
           } catch (err) {
             const ms = Date.now() - toolStart;
-            if (cb.onToolEnd) cb.onToolEnd(tag, `Error: ${err.message}`, ms, { id: invocationId, call, attrs, meta: null, error: err });
+            const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, `Error: ${err.message}`, ms, { id: invocationId, call, attrs, meta: null, error: err }) : null;
             if (cb.onError) {
               cb.onError({ message: `Tool error (${tag}): ${err.message}`, isWarning: true });
             } else {
@@ -1815,6 +1913,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
             }
             logToolCall(tag, { args: call.slice(1) }, false, 'error');
             results.push(`${tag}: Error — ${err.message}`);
+            displayCores.push(displayCore || null);
             if (debugEntries) debugEntries.push({ tag, call, ms, status: 'exception', exitCode: null, result: `Error — ${err.message}` });
           }
         }
@@ -1891,7 +1990,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
           const reason = isAborted() ? 'user interrupted' : 'after user denied an action';
           if (isNativeCall) {
             for (let i = 0; i < results.length; i++) {
-              messages.push({ role: 'tool', tool_call_id: nativeToolCallIds[i], content: results[i] });
+              messages.push(_nativeToolMessage(nativeToolCallIds[i], results[i], displayCores[i]));
             }
           } else {
             messages.push({
@@ -1905,14 +2004,44 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
       if (isNativeCall) {
         for (let i = 0; i < results.length; i++) {
-          messages.push({ role: 'tool', tool_call_id: nativeToolCallIds[i], content: results[i] });
+          messages.push(_nativeToolMessage(nativeToolCallIds[i], results[i], displayCores[i]));
+        }
+        // view_image on the native rail: OpenAI `tool` messages can't carry image
+        // parts, so stage the encoded image(s) on a trailing user turn (exactly the
+        // /image mechanism). buildProviderMessages turns `images[]` into vision
+        // blocks on the next request; the text result already landed on the tool
+        // message above.
+        if (stagedImages.length) {
+          messages.push({
+            role: 'user',
+            content: 'The image(s) requested via view_image are attached to this message for your analysis. '
+              + 'They are visible to you (the model) only — not shown to the user.',
+            images: stagedImages,
+          });
         }
       } else {
         const feedback = results.join('\n\n');
-        messages.push({
+        // Phase 6b — XML rail replay parity. The feedback blob folds every tool
+        // result of this turn into ONE {role:'user'} message and cannot be split
+        // back by parsing (the only separator, \n\n, appears freely inside result
+        // bodies). So persist the per-call display descriptors as a sibling
+        // `_display[]` aligned 1:1 with `results` (same serialized cores the
+        // native rail attaches, see _nativeToolMessage), preserving `null`s for
+        // ops with no descriptor. `content` stays BYTE-IDENTICAL (Inv. 1) — the
+        // model never sees `_display` (stripInternalKeys drops it before the wire).
+        // Replay (chat-session.displayLoadedMessages) only renders per-call when
+        // EVERY slot is a non-null known-version core; a single `null` (e.g. a web
+        // op, out of scope until 6c) keeps the whole blob on the legacy summary.
+        const resultsMsg = {
           role: 'user',
           content: `Tool execution results:\n\n${feedback}\n\nContinue with the task. If everything is done, summarize what was accomplished.`,
-        });
+          _display: displayCores.slice(),
+        };
+        // view_image on the XML rail: the tool-result blob is a single user
+        // message, which CAN carry image parts — attach the staged image(s) so
+        // buildProviderMessages renders them as vision blocks next turn.
+        if (stagedImages.length) resultsMsg.images = stagedImages;
+        messages.push(resultsMsg);
       }
     }