@semalt-ai/code 1.19.0 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. package/.claude/settings.local.json +2 -1
  2. package/ARCHITECTURE.md +6 -95
  3. package/CLAUDE.md +196 -1874
  4. package/README.md +1 -1
  5. package/docs/ARCHITECTURE.md +1321 -0
  6. package/docs/CONFIG.md +340 -0
  7. package/docs/HISTORY.md +245 -0
  8. package/index.js +1 -1
  9. package/lib/agent.js +145 -16
  10. package/lib/api.js +28 -3
  11. package/lib/commands/chat-session.js +187 -4
  12. package/lib/commands/chat-slash.js +16 -0
  13. package/lib/commands/chat-turn.js +272 -49
  14. package/lib/commands/chat.js +12 -8
  15. package/lib/config.js +27 -0
  16. package/lib/constants.js +30 -1
  17. package/lib/headless.js +36 -1
  18. package/lib/images.js +8 -2
  19. package/lib/permissions.js +23 -16
  20. package/lib/prompts.js +15 -3
  21. package/lib/tool_registry.js +357 -53
  22. package/lib/tool_specs.js +42 -8
  23. package/lib/tools.js +80 -19
  24. package/lib/ui/anim.js +86 -0
  25. package/lib/ui/ansi.js +17 -27
  26. package/lib/ui/chat-history.js +253 -71
  27. package/lib/ui/create-ui.js +67 -24
  28. package/lib/ui/diff.js +90 -25
  29. package/lib/ui/file-activity.js +236 -0
  30. package/lib/ui/format.js +173 -28
  31. package/lib/ui/input-field.js +5 -4
  32. package/lib/ui/md-stream.js +234 -0
  33. package/lib/ui/render-operation.js +113 -0
  34. package/lib/ui/select.js +1 -4
  35. package/lib/ui/status-bar.js +99 -57
  36. package/lib/ui/stream.js +20 -13
  37. package/lib/ui/theme.js +190 -45
  38. package/lib/ui/tool-operation.js +190 -0
  39. package/lib/ui/utils.js +9 -5
  40. package/lib/ui/web-activity.js +58 -6
  41. package/lib/ui/writer.js +159 -45
  42. package/lib/ui.js +1 -1
  43. package/package.json +1 -1
  44. package/test/anim-driver.test.js +153 -0
  45. package/test/ask-user-display.test.js +226 -0
  46. package/test/ask-user-gate.test.js +231 -0
  47. package/test/chat-history-nocolor.test.js +155 -0
  48. package/test/chat-relogin.test.js +207 -0
  49. package/test/defer-detail-band.test.js +403 -0
  50. package/test/detail-band-tab-flatten.test.js +242 -0
  51. package/test/exec-diff.test.js +268 -0
  52. package/test/executors.test.js +250 -13
  53. package/test/extract-tool-calls.test.js +37 -3
  54. package/test/file-activity.test.js +522 -0
  55. package/test/grep-path-target.test.js +227 -0
  56. package/test/harness/chat-harness.js +2 -1
  57. package/test/headless.test.js +146 -1
  58. package/test/input-field-ctrl-o.test.js +37 -0
  59. package/test/live-height-physical.test.js +281 -0
  60. package/test/max-iterations.test.js +9 -7
  61. package/test/md-stream.test.js +183 -0
  62. package/test/native-dispatch.test.js +53 -0
  63. package/test/native-live-narration.test.js +254 -0
  64. package/test/output-heredoc-leak.test.js +195 -0
  65. package/test/output-preview.test.js +245 -0
  66. package/test/permissions.test.js +199 -0
  67. package/test/read-paginate.test.js +1 -1
  68. package/test/render-operation.test.js +317 -0
  69. package/test/replay-descriptor-xml.test.js +216 -0
  70. package/test/replay-descriptor.test.js +189 -0
  71. package/test/replay-web-aggregate.test.js +291 -0
  72. package/test/replay-web-persist.test.js +241 -0
  73. package/test/running-glyph-anim.test.js +111 -0
  74. package/test/status-bar-driver.test.js +93 -0
  75. package/test/status-bar-resync.test.js +188 -0
  76. package/test/stream-parser.test.js +24 -0
  77. package/test/theme-palette.test.js +166 -0
  78. package/test/truncate-visible.test.js +78 -0
  79. package/test/view-image.test.js +199 -0
  80. package/test/web-activity-ordering.test.js +12 -3
  81. package/path +0 -1
@@ -0,0 +1,245 @@
1
+ # semalt-code — History, Decisions & Rationale
2
+
3
+ > Dependency-policy rationale, the full "Key Patterns & Invariants" reference,
4
+ > and the "Deferred / Not Yet Implemented" roadmap. **Not auto-loaded** as project
5
+ > memory. The lean `CLAUDE.md` carries the compressed, verified invariant set;
6
+ > this file preserves the long-form rationale and the per-task history.
7
+
8
+ > Per-task (Task X.Y) rationale and the "Tested by …" enumerations live inline in
9
+ > `docs/ARCHITECTURE.md` alongside each subsystem they describe.
10
+
11
+ ---
12
+
13
+ ## Dependency & Supply-Chain Policy (Task 3.2)
14
+
15
+ The project ran with **zero runtime dependencies** through Phase 2. Adopting the official
16
+ MCP SDK (`@modelcontextprotocol/sdk`) in v1.9.0 ends that era. The invariant is now
17
+ **minimal, vetted, pinned dependencies** — not "no dependencies."
18
+
19
+ **When a runtime dependency is allowed.** Every new runtime dependency must be:
20
+
21
+ 1. **Minimal** — preferred only when a Node.js built-in genuinely cannot do the job.
22
+ The bar for the *first* dependency was high on purpose; the bar for the next one
23
+ is the same. Dev-only tooling is still avoided (we lint with `node --check` and
24
+ test with `node:test`).
25
+ 2. **Justified** — a one-line rationale recorded here (see below) and in the PR.
26
+ 3. **Pinned to an exact version** — no `^`/`~`/ranges in `package.json`. Upgrades are
27
+ deliberate, reviewed commits, never silent on `npm install`.
28
+ 4. **Reviewed** — adding/bumping a dependency is a reviewed change, and the
29
+ regenerated `package-lock.json` is committed in the same PR.
30
+
31
+ **Rationale for the web-extraction deps (Task W.1, all pinned exact).** The
32
+ web-fetch pipeline (see **Web Fetch Pipeline** in `docs/ARCHITECTURE.md`) turns raw HTML into
33
+ main-content Markdown — reliably parsing real-world malformed HTML, scoring the
34
+ main article over chrome, and emitting clean Markdown are each large, bug-prone
35
+ surfaces where a hand-rolled regex approach is exactly the wrong call (quality is
36
+ the whole point). The chosen libraries are the reference implementations:
37
+ - **`@mozilla/readability` (`0.6.0`)** — Firefox Reader View's extractor; the
38
+ de-facto standard for "main content of a page." MIT. **Zero transitive deps.**
39
+ - **`turndown` (`7.2.4`)** — the reference HTML→Markdown converter. MIT. One
40
+ transitive dep (`@mixmark-io/domino`, a DOM impl).
41
+ - **`linkedom` (`0.18.12`)** — a light DOM for Readability to operate on
42
+ (`jsdom` is far heavier and unnecessary here). MIT. Transitive footprint:
43
+ `css-select`, `css-what`, `boolbase`, `nth-check`, `domhandler`,
44
+ `domelementtype`, `domutils`, `dom-serializer`, `entities`, `cssom`,
45
+ `htmlparser2`, `html-escaper`, `uhyphen` (`canvas` is an *optional* dep, left
46
+ uninstalled). **Total added: ~18 packages, `npm audit` clean (0 advisories).**
47
+ All three are loaded directly (CommonJS-compatible) from `lib/web-extract.js` —
48
+ no ESM boundary needed (unlike the MCP SDK).
49
+
50
+ **Rationale for `@modelcontextprotocol/sdk` (pinned `1.29.0`).** MCP is an open
51
+ protocol with a non-trivial wire contract (JSON-RPC framing, capability negotiation,
52
+ transport lifecycle, schema validation). Reimplementing it by hand would be a large,
53
+ bug-prone surface to own and keep in spec. The **official** SDK is the reference
54
+ implementation, MIT-licensed, and tracks the spec — exactly the case where a vetted
55
+ dependency beats a built-in reimplementation. It is the foundation Task 3.3 builds the
56
+ MCP client on.
57
+
58
+ **ESM/CJS boundary.** The SDK is **ESM-only** (`"type": "module"`); this project is
59
+ CommonJS. A CJS module cannot `require()` an ESM-only package. The entire codebase
60
+ stays CommonJS — the SDK is loaded in exactly one place, `lib/mcp/boundary.js`, via
61
+ dynamic `import()`, which re-exposes a CJS-friendly async surface (`loadSdk`,
62
+ `createClient`, `createStdioTransport`). No other module imports the SDK directly.
63
+ See **MCP Boundary** in `docs/ARCHITECTURE.md`.
64
+
65
+ **Lockfile + CI guardrails.** `package-lock.json` is committed. CI (`.github/workflows/ci.yml`) runs:
66
+ - `npm ci` — installs strictly from the lockfile; fails on package.json↔lockfile drift (integrity).
67
+ - `npm audit --omit=dev --audit-level=high` — fails the build on a **HIGH or CRITICAL**
68
+ advisory in the **runtime** (production) dependency tree. Dev deps are excluded
69
+ (there are none today).
70
+
71
+ **Audit-findings policy.** When `npm audit` flags an advisory:
72
+
73
+ - **Critical / High** → **blocking.** CI fails. Resolve before merge by bumping to a
74
+ patched pinned version (regenerate + commit the lockfile), or — if no fix exists —
75
+ removing/replacing the dependency. A temporary, time-boxed exception requires an
76
+ explicit `npm audit` allow-list entry **with a written justification and a tracking
77
+ issue**; it is not the default.
78
+ - **Moderate / Low** → **non-blocking** (the `--audit-level=high` gate lets them pass)
79
+ but **tracked**: open an issue and address on the next dependency-maintenance pass.
80
+ Do not raise the gate to fail on these without agreement — noisy gates get ignored.
81
+ - **Routine maintenance** → periodically run `npm audit` and `npm outdated`; dependency
82
+ bumps follow the pinning + review rules above.
83
+
84
+ ---
85
+
86
+
87
+ ## Key Patterns & Invariants
88
+
89
+ - **Minimal, pinned dependencies**: prefer Node.js built-ins; a runtime dependency must be minimal, justified, pinned to an exact version, and reviewed (see **Dependency & Supply-Chain Policy**). Today: `@modelcontextprotocol/sdk` (MCP) and the web-extraction set `@mozilla/readability` + `linkedom` + `turndown` (Task W.1).
90
+ - **CommonJS**: all files use `require()`/`module.exports`. Do not use ES `import`/`export`. The one exception is the **dynamic** `import()` inside `lib/mcp/boundary.js`, which is the sole bridge to the ESM-only MCP SDK — the project itself stays CommonJS.
91
+ - **Streaming**: `api.js` manually parses `text/event-stream`. The parser in `chatStream()` handles partial JSON lines — be careful editing it.
92
+ - **Permissions are per-session**: `PermissionManager` resets on each CLI invocation. Approvals never persist to disk. In non-TTY mode, tool calls that would normally need interactive confirmation are **refused** (not auto-approved) unless `--dangerously-skip-permissions` is set or the tag is pre-approved by an `--allow-*` tier flag.
93
+ - **Destructive-command deny-list** (`lib/deny.js`): every shell call (`exec`/`shell`) passes through `classifyShellCommand()` at the single chokepoint in `agentExecShell`, in *all* modes and regardless of `--allow-*` flags. Handling depends on the **initiator**:
94
+ - **Agent-initiated** (the model asked, the default): any deny-list hit is a **hard block** — `rm -rf`, `curl … | sh`, disk-wipe/fork-bomb patterns, recursive chmod/chown on a system root, and writes to system paths.
95
+ - **User-initiated** (a human typed `!cmd` or `semalt-code shell`): the user owns their machine, so a deny-list hit is **not** hard-blocked. The exception is the **catastrophic subset** (`catastrophic: true` — disk-wipe / block-device write, fork bomb), which interposes a single y/N confirmation as a typo guard; all other deny-listed user commands run with a `bypassed` note.
96
+ - The only full bypass (skips classification entirely) is `--dangerously-skip-permissions`.
97
+ - **Cross-platform + canonicalized (Task 4.4):** the list now covers the
98
+ **Windows** destructive set (`del /s`, `rd`/`rmdir /s`, `Remove-Item -Recurse
99
+ -Force`, `format`, `Format-Volume`, `Clear-Disk`, `cipher /w`, `diskpart …
100
+ clean`) in addition to POSIX — relevant because native Windows has no OS
101
+ sandbox. Matching also runs against a **procfs-root-canonicalized** variant
102
+ (`/proc/self/root` and `/proc/<pid>/root` rewritten to `/`) so a
103
+ `/proc/self/root/etc/…` bypass is caught by the same system-path matchers
104
+ (the resolved-path principle, shared with the OS sandbox).
105
+ - **Untrusted web content**: `http_get` runs the **web-fetch pipeline** (Task W.1 / W.1b, `mode` = summarized→extract→Markdown→secondary-LLM summary / extracted→Markdown / raw→original token-capped content) so by default only a compact result enters context (`raw` mode deliberately returns the original markup, still **token-capped**, for page analysis); the result in **every** mode is wrapped in the explicit `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` block (`lib/agent.js`), and the secondary summarizer treats the page as data-only (a page injection could have steered it). The system prompt (`lib/prompts.js`) instructs the model never to act on instructions inside such a block. MCP tool results and **lifecycle-hook output** reuse the same fence. See **Web Fetch Pipeline**.
106
+ - **Lifecycle hooks are deny-listed + sandboxed shell + untrusted output** (`lib/hooks.js`): a `PreToolUse` non-zero exit blocks the tool; every hook command passes through `checkShellDenylist` AND the **OS sandbox** (`resolveSandboxedSpawn`, Pre-Task 5.0a) before running; hook stdout is fenced as untrusted before it reaches the model; timeouts/sandbox-refusals/failures are contained and never crash the loop. **Project-layer command hooks and `verify.command` are quarantined** (`loadHookLayers`/`loadVerifyLayers`): a cloned-repo `.semalt/config.json` can never introduce host-privileged execution, only inert prompt text.
107
+ - **`--readonly` blocks every file-mutating tool** (`READONLY_BLOCKED`, `lib/permissions.js`, completed in Pre-Task 5.0c): `write_file`, `append_file`, `edit_file`, `replace_in_file`, `delete_file`, `make_dir`, `remove_dir`, `move_file`, `copy_file`, `upload`, `download`. The block is enforced at the executor (`permissionManager.readonlyBlock(tag)`), so it holds for both the XML and native paths; `describePermission` also short-circuits the gate (no approval prompt precedes the deterministic block). **Scope decision (load-bearing): `--readonly` governs FILE TOOLS only.** Shell (`exec`/`shell`) is **not** in the set — a read-only session must still run read-only commands (`ls`, `git status`), and a shell command's arbitrary write side effects are the **OS sandbox + deny-list's** job to confine (the right layer post-Pre-Task 5.0a), not `--readonly`. So `--readonly` is an honest "no file-tool writes," not a false "no writes at all." Read-only file tools (`read_file`, `grep`, `glob`, `search_in_file`, `file_stat`, `list_dir`) work unchanged. Tested by `test/readonly-tools.test.js`.
108
+ - **Secret-file read guard**: `isProtectedSecretPath()` in `tools.js` refuses reads/copies/moves of `config.json`, `memory.json`, and `audit.log` via file tools — **not** overridable by `--allow-anywhere` (only by `--dangerously-skip-permissions`).
109
+ - **Config-write guard** (`isProtectedConfigPath()` in `tools.js`, Pre-Task 5.0b): the write-side companion to the read guard. Every write executor (`write_file`, `append_file`, `edit_file`, `replace_in_file`, `move_file`/`copy_file` **dst**, `upload`, `download`) refuses to write into the **protected-config set** — the whole `~/.semalt-ai` dir **and** every project `.semalt` dir from the CWD up to the repo root, **including files that do not yet exist** (directory-prefix matched on the resolved path, so a missing `.semalt/config.json`/`agents/*.md`/hook is covered). The set is defined once as `protectedConfigDirs` (`lib/constants.js`) and shared with the OS sandbox's `protectedPaths`. Same bypass policy as the read guard: **not** overridable by `--allow-anywhere`, only by `--dangerously-skip-permissions` (human-only). This guards the **agent's** file tools and the sandboxed shell — a human editing their own config in an editor is unaffected. Tested by `test/config-write-guard*.test.js`, `test/path-guards.test.js`, and the kernel case in `test/sandbox-integration.test.js`.
110
+ - **Per-pattern permission rules** (`lib/permission-rules.js`, Task 4.1): allow/deny/ask rules matching tool + argument (glob/regex), layered user→project. **Project rules can only NARROW** — every project `allow` is structurally dropped before resolution, so a cloned-repo `.semalt/config.json` can never widen the user posture. Precedence is total/deterministic (deny>ask>allow, most-specific then most-restrictive). Arguments are canonicalized (`..`/symlink/abs-rel) before matching; pathological/malformed rules fail closed; an `allow` never bypasses the deny-list, secret guard, `--readonly`, or `isPathSafe` (those stay in the executors). A `deny` rule holds even under `--dangerously-skip-permissions`. See **Per-Pattern Permissions** above.
111
+ - **Checkpoints & rewind** (`lib/checkpoints.js`, Task 4.3 / 4.3b): before each file-tool mutation the file's prior state is snapshotted (post-gate, pre-mutation, in `agentExecFile`) so `/rewind` can restore it — **file-tool changes only; shell side effects are not reversible.** Capture is fail-safe (a snapshot failure never blocks the mutation); a denied/withheld call produces no checkpoint; subagent mutations are checkpointed into the parent session. Delete/move are reversed explicitly; an external-modification check warns/asks before clobbering out-of-band edits. A per-file size cap and per-session retention are enforced. **Rewind is human-only (no rewind tool in the registry).** Task 4.3b: the restore path **re-validates the current guards** (`isPathSafe`/secret/protected-config/`deny` rule) per target — a now-forbidden path is refused/skipped, and `force` overrides only the external-mod check, not the guards; **three restore modes** `code`/`conversation`/`both` (default both) restore files, history, or the linked state, with conversation truncation cutting on **turn boundaries** (no orphaned `tool_call`; discard policy) — all on the **unchanged** on-disk schema. See **Checkpoints & Rewind** above.
112
+ - **Native git tools** (`lib/tool_registry.js`, Task 5.1): eight first-class git tools shelling out through the **same** `agentExecShell` sandbox + deny-list chokepoint as `<shell>` (no privileged path around confinement), parsing output into structured results. Read-only (`git_status`/`git_diff`/`git_log`, plus the *list* ops of `git_branch`/`git_worktree`) return a null permission descriptor; mutating (`git_add`/`git_commit`/`git_branch`/`git_checkout`/`git_worktree` add/remove) require approval, honor `--readonly`, and pass the per-pattern rules. `git_commit` requires a real non-empty message (empty → error, never a placeholder). **Destructive-git ↔ checkpoint honesty:** git operations are NOT reversible via `/rewind` (checkpoints snapshot file-tool mutations only) — stated in the descriptions and prompt text. Not-a-repo / git-absent degrade gracefully. See **Native Git Tools** above.
113
+ - **API-key sourcing** (`lib/secrets.js`): precedence is `SEMALT_API_KEY` env → OS keychain (macOS `security` / Linux `secret-tool` / Windows PasswordVault) → `config.json`. Keys from env/keychain are never written back to config; `configShow` reports only `api_key_source`. Store a key with `semalt-code auth set-key`.
114
+ - **Token counting is approximate**: `estimateTokens()` divides char count by 4. It is used only for the `/compact` display — do not rely on it for hard limits.
115
+ - **Context trimming is proactive when a limit is known**: `chatStream()` uses the in-process `_sessionInputLimits` learned from a prior 400 overflow first, then falls back to `config.context_length * 0.9`. When neither is set, no pre-flight trim runs and the client relies on the reactive 400/413 handler (which then persists the discovered window). `Metrics.tokenLimitStatus()` returns `{ used, limit: null }` until a limit is learned, so the status bar shows "N tok · limit unknown" instead of hiding the line.
116
+ - **Shell/exec output entering context is bounded** (Task W.6, `capShellOutput` in `lib/agent.js`): the model-facing shell result is double-bounded — a **head+tail line cap** (`max_output_lines`, default 50, split first ~60% + last ~40% via `OUTPUT_HEAD_RATIO`) eliding the middle, **then** a **token safety net** (`max_output_tokens`, default 10000, reusing the web pipeline's `capToTokens`) so a few enormous lines (minified JS, a binary `cat`) can't blow context. The elision notice teaches the W.5-enabled redirect-to-file→grep pattern. **The exit code stays on its own line, so truncating output VOLUME never hides the command's OUTCOME** (a non-zero exit / failure is always surfaced). Applied at the context boundary in the agent loop — distinct from the **UI** cap (`lib/ui/diff.js`, display only), which stays. Before W.6 the cap was UI-only and the model received the **entire** unbounded stdout+stderr (the #1 context risk). Pure helper, unit-tested on the model-facing text + a real-loop assertion (`test/shell-output-cap.test.js`). MCP/subagent output bounding is Task W.8 (below); W.9 unifies all the paths into a shared chokepoint.
117
+ - **MCP & subagent results entering context are bounded** (Task W.8, `formatMcpResult`/`formatSubagentResult` in `lib/agent.js`): the last two unbounded paths. Both apply `capToTokens` (the W.5–W.7 standard) to the result text **before** wrapping it in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence, with **distinct budgets reflecting their nature**: **MCP is stricter** (`mcp.max_result_tokens`, default **10000**) because the payload size is third-party/server-controlled and untrusted — the riskiest path; **subagent is generous** (`subagents.max_result_tokens`, default **20000**) because the child's final text is our own deliberate, synthesized answer (a safety net against a verbose child). For MCP the truncation notice sits **inside** the fence with the capped content — capping never weakens the untrusted perimeter; subagent isolation / no-escalation (3.6/4.5) are unchanged (this bounds returned-text size only). A small result passes through fully, no notice. Pure helpers, unit-tested on the model-facing/parent-facing text incl. the fence-still-present and budgets-differ cases + real-loop assertions (`test/result-cap.test.js`).
118
+ - **`read_file` is paginated** (Task W.7, `formatReadResult` in `lib/agent.js`): `read_file` used to dump the **whole file verbatim** into context (`File <path>:\n` + the entire content); the only guard was a hard byte refusal at `max_file_size_kb`. Worst case ~128k tokens for a 500 KB file. Now the **model-facing** result is paginated, mirroring the Claude Code standard: under a **line cap** (`read_line_cap`, default **2000**) the file reads **byte-for-byte as before** (no regression for the common small-file case); over the cap it returns the first page + a **`[PARTIAL]` notice** — `Showing lines 1–2000 of 5234. Read more with start_line=2001.` **`start_line`/`end_line`** (on both XML + native rails; absent → null, tuple parity) read an explicit slice, **also line-capped** so a huge explicit range can't dump everything. A **token safety net** (`read_max_tokens`, default **25000**, reusing the web pipeline's `capToTokens`) bounds the pathological few-but-enormous-lines case (one 100 KB minified line) the line cap misses — consistent with W.6's double-bound. The bound is applied at the **context boundary** in the formatter (the executor still returns the full content, like W.5/W.6); pagination — not the byte cap — is the primary bound, so `max_file_size_kb` is now a **backstop** (raised default **50 MB**) ruling out a multi-GB whole-file slurp (lower it to hard-refuse smaller files). **Line numbers are OPTIONAL, default OFF** (`show_line_numbers`): the **Step 0 finding** is that `edit_file` is **line-number-based** (`lines[N-1]=content`) while `replace_in_file` is **match-based** (regex on a search string) — a mix — so always-on numbers would corrupt copyable snippets for the match path **and** cost ~1.7× per read; the param turns absolute 1-based numbers on (aligned with `edit_file`'s addressing) for when the agent wants line refs to drive an edit. Line indexing matches `edit_file`'s `split('\n')` exactly, so the read→edit loop stays aligned. 
Pure helper, unit-tested on the model-facing text incl. the no-regression small-file case + the PARTIAL large-file case + rail parity + read→edit alignment (`test/read-paginate.test.js`).
119
+ - **grep/glob results are serialized + bounded** (Task W.5, `formatGrepResult`/`formatGlobResult` in `lib/agent.js`): `formatFileResult` now has `case 'grep'`/`case 'glob'` that turn the structured engine result into model-facing text — closing a correctness bug where both fell through the default and the model received `"grep: done"`/`"glob: done"` (the data was computed and even shown in the UI, but never entered context, making grep-first navigation impossible). grep `output_mode` (`content`/`files_with_matches`/`count`) is model-selectable via the spec; `head_limit` (default `DEFAULT_GREP_HEAD_LIMIT`/`DEFAULT_GLOB_HEAD_LIMIT` = 100) + optional `offset` bound what reaches the model — the engine's 1000/5000 internal caps were never a context bound (the result was dropped before it reached context). Over-limit serialization carries a truncation notice telling the agent how to narrow (refine the pattern, switch to `count`/`files_with_matches`, or raise `head_limit`); under-limit results show fully with no notice. The executors (`lib/tool_registry.js`) normalize and attach `output_mode`/`head_limit`/`offset` onto the result; the serializers are pure and tested on the **model-facing** text (`test/grep-glob-serialize.test.js`, incl. the real-loop regression).
120
+ - **Tool output enters context ONLY via the `boundToolOutput` chokepoint** (Task W.9, `lib/agent.js`): the size analogue of the `resolveSandboxedSpawn` sandbox chokepoint. W.5–W.8 each bounded a previously-unbounded path, but the `capToTokens`-+-fence step was duplicated ad-hoc in five places — the original bugs (grep/glob `"done"`, shell/MCP/subagent unbounded) were all the **same class**: a path that put output into context without bounding it. `boundToolOutput(text, { budget, notice, fenced })` is the **single application point**: it applies `capToTokens` with the path's **budget** and **notice** function and (when `fenced`) wraps in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence. **grep/glob, shell, read_file, MCP, subagent — and http_get/web_search — all route through it.** The per-path policy is **deliberately distinct and NOT flattened**: budgets (MCP 10k < subagent 20k < read 25k; shell 10k; grep/glob `DEFAULT_GREP_GLOB_MAX_TOKENS` 10k — a new token net so a few huge minified match lines can't blow context, the W.6 lesson applied to grep's count-bound), notice wording (shell teaches redirect→grep, read teaches narrow-the-range, …), and the fence flag (MCP/subagent/web fenced; file/shell not). **Refactor-safe:** model-facing outputs are byte-identical to W.5–W.8 (the W.5–W.8 test suites pass unchanged); http_get/web_search bodies are already token-capped upstream so they pass **no budget** (fence only). **Structural regression prevention:** a new tool gets bounding by *routing* its output through the chokepoint, not by *remembering* to cap. Pure helper, unit-tested on the chokepoint behavior, per-path policy, the bound-by-construction invariant, and equivalence (`test/output-chokepoint.test.js`). 
The system prompt's `LOCAL_NAVIGATION_NOTICE` (`lib/prompts.js`, both templates) — now actionable post-W.5 — steers the grep-first / read-slice pattern: locate with `grep`/`glob` (`count`/`files_with_matches` modes), then `read_file` only the relevant `start_line`/`end_line` slice; redirect large command output to a file and grep it.
121
+ - **Bounded agent iterations**: the primary loop caps at `config.max_iterations` (default 125, via `DEFAULT_MAX_ITERATIONS` in `constants.js`), overridable with `--max-iterations <n>`; `--max-iterations 0`/`"unlimited"` removes the cap deliberately. Reaching the cap stops gracefully (clear message + `stopReason: "max_iterations"`), never silently. Subagents have their own cap of 12.
122
+ - **Malformed tags are skipped**: each tool dispatch in the agent loop is wrapped in try/catch; an error emits a warning line and the loop continues to the next tool call.
123
+
124
+ ---
125
+
126
+ ## Deferred / Not Yet Implemented
127
+
128
+ This section exists because false documentation has burned this project before (a
129
+ "max 10 iterations" invariant that never existed; coverage assumed but absent). The
130
+ items below are things a reader might reasonably expect from the docs or from peer
131
+ tools but that the code **does not do today**. They are listed honestly so nobody
132
+ builds on a feature that isn't there. Each is marked **Planned (Phase 4+)** —
133
+ on the roadmap — or **Out of scope** — no current plan.
134
+
135
+ **Gaps the re-audit found in existing behavior:**
136
+
137
+ - **MCP in headless / one-shot** — *Planned (Phase 4+).* `connectAll()` runs only in
138
+ interactive `cmdChat` (and the `mcp` management commands); `code`/`edit`/`shell`/`-p`
139
+ never connect a manager, so MCP tools are unavailable there. See **MCP Client → Scope**.
140
+ - **Session auto-resume** — *Planned (Phase 4+).* Sessions are saved, but there is no
141
+ startup prompt offering to resume the most recent (< 24 h) session. Resume is always
142
+ explicit: `/history` (local) or `--resume <id>` (dashboard). See **Session Storage**.
143
+ - **Corporate-proxy consumption** — *Planned (Phase 4+).* `HTTPS_PROXY`/`HTTP_PROXY`
144
+ are parsed into config but `api.js` does not route requests through a proxy agent,
145
+ so they have no effect on outbound HTTP. See **Config hierarchy → Environment**.
146
+
147
+ **Phase 4 roadmap (Planned, in the stated order):**
148
+
149
+ - **Per-pattern permissions** — ✅ **Done (Task 4.1).** Rich allow/deny/ask rules
150
+ matching tool + argument (glob/regex), layered user→project. See **Per-Pattern
151
+ Permissions** above.
152
+ - **Self-verification** — ✅ **Done (Task 4.2).** When the agent declares done,
153
+ optionally run a configured verify command (advisory mode feeds the result back;
154
+ enforcing mode returns the agent to the loop until verify passes, bounded by
155
+ `max_attempts` → `verify_failed`). See **Self-Verification** above.
156
+ - **Checkpoints / rewind** — ✅ **Done (Task 4.3 file half + Task 4.3b
157
+ conversation + restore re-validation).** Per-write file snapshots before each
158
+ file-tool mutation; `/rewind` restores prior content (last or to a chosen
159
+ sequence), with delete/move handled and an external-modification check that never
160
+ silently clobbers out-of-band edits. **File-tool changes only — shell side
161
+ effects are not reversible.** Task 4.3b closed the last deferred 4.3 security
162
+ finding (the restore path now **re-validates the current
163
+ isPathSafe/secret/protected-config/`deny`-rule guards** per target — `force`
164
+ overrides only the external-mod check) and added **three restore modes**
165
+ (`code`/`conversation`/`both`, default both) using the existing turn-linkage,
166
+ with conversation truncation cutting on **turn boundaries** (no orphaned
167
+ `tool_call`; discard policy) on the **unchanged** on-disk schema. Rewind stays
168
+ **human-only** (no rewind tool registered). See **Checkpoints & Rewind** above.
169
+ - **OS sandbox** — ✅ **Done (Task 4.4 filesystem + Task 4.4b network).** Real
170
+ OS-level confinement for shell commands: Seatbelt (macOS) / bubblewrap
171
+ (Linux/WSL2) jail every command and its children, confining writes to the working
172
+ dir and keeping `~/.semalt-ai`/secrets/`/etc` read-only (incl. not-yet-existing
173
+ files), with a fail-safe ask-or-block fallback when the primitive is absent, and with no
174
+ model-reachable way to disable it. **Network isolation is now done as well —
175
+ binary on/off** (bwrap `--unshare-net` / Seatbelt `(deny network*)`), no host
176
+ proxy / no domain allowlist / no TLS interception, anti-fail-open default. See
177
+ **OS Sandbox** above.
178
+
179
+ **Done since:**
180
+
181
+ - **Native git tooling** — ✅ **Done (Task 5.1).** Eight first-class git tools
182
+ (`git_status`/`git_diff`/`git_log` read-only; `git_add`/`git_commit`/`git_branch`/
183
+ `git_checkout` mutating; `git_worktree` infrastructure) shelling out through the
184
+ sandbox + deny-list chokepoint with structured results. The long tail stays in the
185
+ generic shell. See **Native Git Tools** above.
186
+ - **Embedding SDK** — ✅ **Done (Task 5.2).** Two-tier library surface separated by
187
+ `package.json` `exports`: the stable `createAgent` facade (main entry) and the
188
+ unstable building blocks (`/internals`). Programmatic permission policy that
189
+ defaults to refusing mutations; sandbox/deny-list stay on with explicit opt-out;
190
+ `close()` teardown; per-instance config (process-global limits documented). See
191
+ **Embedding SDK** above.
192
+ - **Background tasks** — ✅ **Done (Task 5.3).** `run --background` launches a
193
+ detached agent process (own process = own global state, reusing the
194
+ `createAgent` facade) with a launch-fixed, refuse-by-default policy and
195
+ sandbox/deny-list on; a file-based task registry (`~/.semalt-ai/tasks/`) drives
196
+ `tasks list|status|result|kill|prune`. Validation runs before detach (no
197
+ orphans); stale/dead tasks are detectable and prunable; kill tree-kills by PID.
198
+ Background-launch is intentionally NOT an agent tool. See **Background Tasks**
199
+ above.
200
+ - **Multimodal image input** — ✅ **Done (Task 5.4).** PNG/JPEG/WebP/GIF attach via
201
+ `--image` (repeatable), in-chat `/image`, and the SDK `images` option; read
202
+ through `isPathSafe`, size-capped (`image_max_bytes`), base64-encoded, media
203
+ type detected from magic bytes. The provider content-part shape (Anthropic-style
204
+ vs OpenAI-style) is selected per profile/heuristic; a text-only model fails loud
205
+ (the image is never silently dropped). PDF input deferred; generation out of
206
+ scope. See **Multimodal Image Input** above.
207
+
208
+ **Planned, not yet scheduled:**
209
+
210
+ - **Cost caps** — hard spend limits per session/turn (today cost is *displayed* via
211
+ `lib/pricing.js`, never enforced).
212
+ - **Auto-update** — self-updating the CLI (today: `npm install -g` manually).
213
+ - **XDG / `%APPDATA%` config dirs** — honoring platform config-dir conventions instead
214
+ of the fixed `~/.semalt-ai/`.
215
+ - **Domain-allowlist network policy** — *deliberately deferred, may stay out of
216
+ scope.* Task 4.4b ships **binary** network isolation (on / kernel-level none); a
217
+ per-domain allowlist ("allow github.com, block the rest") is **not** implemented
218
+ and is **not** a planned increment by default. **Rationale:** domain-granularity
219
+ requires a host-side egress proxy with full network privileges, which is the
220
+ exact design the reference implementation shipped and that was **bypassed
221
+ completely, twice, over 5.5 months** (allowedDomains fail-open CVE-2025-66479, a
222
+ hostname-parser differential, and TLS-MITM breaking Go binaries). We will only
223
+ revisit this if it can be done **without** a host proxy / TLS interception (e.g.
224
+ a kernel/eBPF egress filter on resolved IPs) — until then, binary isolation is
225
+ the robust posture. See **OS Sandbox → Why binary**.
226
+ - **Native-Windows / WSL1 sandbox** — no OS primitive today (bwrap needs the
227
+ user/mount namespaces WSL1 lacks; native Windows has none). On those platforms
228
+ the sandbox degrades to the fail-safe fallback (ask-or-block); the Windows
229
+ deny-list (now covered, Task 4.4) is the remaining shell guard there.
230
+
231
+ **Out of scope (no current plan):**
232
+
233
+ - **Multimodal — image *input*** is ✅ **Done (Task 5.4)** — PNG/JPEG/WebP/GIF
234
+ attached via `--image` / `/image` / the SDK `images` option, sent provider-
235
+ specifically to vision models (text-only models fail loud). See **Multimodal
236
+ Image Input** above. Still out of scope: **PDF input** (deferred), **audio
237
+ input**, and **image/audio *generation* / output**.
238
+ - **Background / cloud / scheduling** — long-running background agents, cloud execution,
239
+ or cron-style scheduling (local detached `run --background` tasks are ✅ done — Task 5.3).
240
+ - **OpenTelemetry** — OTel traces/metrics export.
241
+ - **Managed policy** — centrally-administered org policy enforcement.
242
+ - **Native notifications** — OS-level desktop notifications.
243
+
244
+ ---
245
+
package/index.js CHANGED
@@ -289,7 +289,7 @@ Options:
289
289
  no host proxy, no domain allowlist, no TLS interception.
290
290
  Same effect as sandbox.network "off" in config.
291
291
  --readonly Block all write operations
292
- --max-iterations <n> Cap agent-loop iterations per turn (default 50);
292
+ --max-iterations <n> Cap agent-loop iterations per turn (default 125);
293
293
  0 or "unlimited" removes the cap (power-user choice)
294
294
  --no-verify Skip self-verification (config.verify) for this run
295
295
  --dangerously-skip-permissions
package/lib/agent.js CHANGED
@@ -3,7 +3,7 @@
3
3
  const { logToolCall } = require('./audit');
4
4
  const { Metrics } = require('./metrics');
5
5
  const { getSystemPrompt, getPlanModeNotice } = require('./prompts');
6
- const { isNativeToolsActive } = require('./config');
6
+ const { isNativeToolsActive, getInlineReasoning } = require('./config');
7
7
  const { TAG_REGISTRY, DEFAULT_MAX_ITERATIONS, DEFAULT_GREP_HEAD_LIMIT, DEFAULT_GLOB_HEAD_LIMIT, DEFAULT_GREP_GLOB_MAX_TOKENS, DEFAULT_MAX_OUTPUT_LINES, OUTPUT_HEAD_RATIO, DEFAULT_OUTPUT_MAX_TOKENS, DEFAULT_READ_LINE_CAP, DEFAULT_READ_MAX_TOKENS, DEFAULT_MCP_MAX_RESULT_TOKENS, DEFAULT_SUBAGENT_MAX_RESULT_TOKENS } = require('./constants');
8
8
  const { capToTokens, defaultEstimate, DEFAULT_CHARS_PER_TOKEN } = require('./web-extract');
9
9
  const { mapInvokeToCall } = require('./tools');
@@ -342,6 +342,19 @@ function truncateForDebug(text, maxLines = 40, maxChars = 2000) {
342
342
  // layer (commands.js) feeds the meta into formatToolLine together with
343
343
  // the tag, so the formatter can produce the 4-segment line in either the
344
344
  // pending (live region) or final (scrollback) context.
345
+ // Phase 6a — build one native `{role:'tool'}` result message. `content` is the
346
+ // model-facing bound result string, kept BYTE-IDENTICAL (Inv. 1). A serialized
347
+ // display descriptor core (from onToolEnd), when present, rides along as a
348
+ // sibling `_display` key — additive only, never part of `content`, and stripped
349
+ // before the wire (see api.js) so it is never fed to the model. Replay
350
+ // (chat-history.js) reads `_display` to render with full fidelity; its absence
351
+ // falls back to the legacy summary.
352
+ function _nativeToolMessage(toolCallId, content, displayCore) {
353
+ const msg = { role: 'tool', tool_call_id: toolCallId, content };
354
+ if (displayCore) msg._display = displayCore;
355
+ return msg;
356
+ }
357
+
345
358
  function _metaForTool(tag, result) {
346
359
  if (!result || result.error) return null;
347
360
  switch (tag) {
@@ -383,6 +396,11 @@ function _metaForTool(tag, result) {
383
396
  bytes: result.size_kb ? Math.round(parseFloat(result.size_kb) * 1024) : 0,
384
397
  kind: result.type || null,
385
398
  };
399
+ case 'ask_user':
400
+ // Surface the user's chosen answer as display meta so the committed result
401
+ // line reads "✓ user · ask <question> · → <answer>". Display-only: the
402
+ // model-facing string (formatFileResult) still uses the full question.
403
+ return { answer: result.answer };
386
404
  default:
387
405
  return null;
388
406
  }
@@ -402,6 +420,7 @@ function _attrsFromCall(call) {
402
420
  return { command: args[0] || '' };
403
421
  case 'read':
404
422
  case 'read_file':
423
+ case 'view_image':
405
424
  case 'list_dir':
406
425
  case 'delete_file':
407
426
  case 'make_dir':
@@ -819,6 +838,15 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
819
838
  maxTokens: cfg.read_max_tokens,
820
839
  });
821
840
  }
841
+ case 'view_image':
842
+ // The encoded image rides on result.image and is attached to this turn's
843
+ // tool-result message by the loop below; the model-facing text is just a
844
+ // short confirmation. Wording is deliberate: the image is visible to the
845
+ // MODEL for analysis, NOT shown to the user — so the model must not refer
846
+ // to it as something the user can see.
847
+ return `Image ${result.path} (${result.media_type}, ${result.bytes} bytes) is now attached to your `
848
+ + `vision context — analyze it directly. It was made visible to YOU (the model) for analysis; it was `
849
+ + `NOT displayed to the user, so do not refer to it as something the user can see.`;
822
850
  case 'write':
823
851
  return `Wrote ${result.bytes} bytes to ${args[0]}`;
824
852
  case 'append':
@@ -982,6 +1010,11 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
982
1010
  };
983
1011
 
984
1012
  const nativeTools = isNativeToolsActive(model);
1013
+ // Live-narration safety signal (b): an explicit per-profile assertion that
1014
+ // this model does NOT inline reasoning into delta.content. Only an explicit
1015
+ // `false` is the eager-stream signal; undefined/true keep the safe buffered
1016
+ // fallback. Threaded to the UI gate via onStreamStart alongside nativeTools.
1017
+ const inlineReasoning = getInlineReasoning(model);
985
1018
 
986
1019
  // Checkpoint turn linkage (Task 4.3): tag every checkpoint captured during
987
1020
  // this turn with the conversation point that produced it, so a future
@@ -1077,11 +1110,19 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1077
1110
  ? (token) => {
1078
1111
  if (!streamStarted) {
1079
1112
  streamStarted = true;
1080
- if (cb.onStreamStart) cb.onStreamStart();
1113
+ // Pass the rail + inline-reasoning assertion so the UI gate can
1114
+ // decide whether it is safe to eager-open live narration on the
1115
+ // native rail. The XML rail (nativeTools false) ignores both.
1116
+ if (cb.onStreamStart) cb.onStreamStart(nativeTools, inlineReasoning);
1081
1117
  }
1082
1118
  parser.push(token);
1083
1119
  }
1084
1120
  : null;
1121
+ // Live-narration safety signal (a): surface the first reasoning_content
1122
+ // delta to the UI so it can eager-open the gate before content arrives.
1123
+ const wrappedOnReasoning = cb.onReasoningStart
1124
+ ? () => { cb.onReasoningStart(); }
1125
+ : null;
1085
1126
 
1086
1127
  const MAX_RETRIES = 3;
1087
1128
  const RETRYABLE_STATUS = new Set([408, 425, 429, 500, 502, 503, 504]);
@@ -1108,6 +1149,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1108
1149
  linePrefix: wrappedOnToken ? '' : linePrefix,
1109
1150
  showThink,
1110
1151
  onToken: wrappedOnToken,
1152
+ onReasoning: wrappedOnReasoning,
1111
1153
  silent: !!wrappedOnToken,
1112
1154
  signal: controller.signal,
1113
1155
  onTrim: (info) => {
@@ -1312,7 +1354,19 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1312
1354
  }
1313
1355
  }
1314
1356
  } else {
1315
- toolCalls = extractToolCalls(reply, { model });
1357
+ // No structured native tool_calls this turn. Parse the text for tool
1358
+ // calls — but on the NATIVE rail, suppress the soft TEXT HEURISTICS that
1359
+ // infer commands from untagged prose (the bare ```bash/```sh/```shell
1360
+ // fence pass). On the native rail a finish_reason=stop turn is usually a
1361
+ // plain text final answer, and an illustrative ```bash block in that
1362
+ // narration must NEVER be executed (the incident: a hung `su nobody` and
1363
+ // two placeholder examples were run). EXPLICIT tool-tag dispatch
1364
+ // (<exec>/<shell>/<write_file>/<minimax:tool_call>/<function=…>/MCP tags)
1365
+ // is deliberate and unambiguous, so it stays active on BOTH rails — the
1366
+ // native rail legitimately dispatches tools via those tags too. The XML
1367
+ // rail keeps every heuristic (byte-identical to before): it has no
1368
+ // structured channel, so the fence pass is part of its contract.
1369
+ toolCalls = extractToolCalls(reply, { model, skipTextHeuristics: nativeTools });
1316
1370
  }
1317
1371
  const isNativeCall = nativeToolCalls.length > 0;
1318
1372
  const cleanedReply = cleanAssistantContent(reply);
@@ -1442,10 +1496,21 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1442
1496
  assistantMsg.tool_calls = nativeToolCalls.filter((tc) => acceptedSet.has(tc.id));
1443
1497
  }
1444
1498
  messages.push(assistantMsg);
1445
- // When showThink is off and the turn has tool calls, suppress the text bubble —
1446
- // pre-tool reasoning is noise, tool result bubbles already convey what happened.
1447
- const displayReply = (!showThink && toolCalls.length > 0) ? '' : cleanedReply;
1448
- if (cb.onAssistantMessage) cb.onAssistantMessage(displayReply);
1499
+ // Live narration (Claude-Code style): stream the model's pre-tool "what I'm
1500
+ // about to do" text instead of blanking it when tools are present. `cleanedReply`
1501
+ // has already had ALL reasoning stripped by cleanAssistantContent — the implicit
1502
+ // </think> preamble (Qwen3-style) and any <think>/<reasoning>/<reflection>/<plan>
1503
+ // blocks, plus the tool tags — so no hidden reasoning leaks into the bubble or
1504
+ // persisted history. The implicit-think gate in chat-turn.js is the live-stream
1505
+ // safety net for the token-by-token path; here we simply stop forcing the
1506
+ // post-turn text to '' just because the iteration carried a tool call.
1507
+ const displayReply = cleanedReply;
1508
+ // `terminal` lets the UI tell a final answer from an intermediate tool-call
1509
+ // iteration. Previously the UI used "content is empty" as that proxy (blanked
1510
+ // tool iterations passed ''); now that intermediate iterations also carry
1511
+ // narration, the proxy is gone — pass the real signal so web-activity collapse
1512
+ // (which must only flush on the terminal answer) stays correct.
1513
+ if (cb.onAssistantMessage) cb.onAssistantMessage(displayReply, { terminal: toolCalls.length === 0 });
1449
1514
 
1450
1515
  if (toolCalls.length === 0) {
1451
1516
  // Native mode: tool_calls came in but none could be converted (parse
@@ -1567,6 +1632,17 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1567
1632
  }
1568
1633
 
1569
1634
  const results = [];
1635
+ // view_image staging: encoded image records returned by view_image executors
1636
+ // this turn. Collected here and attached below as `images[]` — on the results
1637
+ // message (XML rail) or a trailing user message (native rail) — so api.js buildProviderMessages turns them into provider
1638
+ // vision blocks on the NEXT model turn (the same wire path /image uses).
1639
+ const stagedImages = [];
1640
+ // Phase 6a — serialized display descriptor cores, pushed in LOCKSTEP with
1641
+ // `results` (one entry per result, null when there is no descriptor — e.g.
1642
+ // a denied/withheld/hook-blocked call never reaches onToolEnd). Since
1643
+ // results[i] ↔ nativeToolCallIds[i] ↔ toolCalls[i], displayCores[i] aligns
1644
+ // with the native tool message pushed below and rides along as `_display`.
1645
+ const displayCores = [];
1570
1646
  const debugEntries = debug ? [] : null;
1571
1647
  let aborted = false;
1572
1648
 
@@ -1621,6 +1697,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1621
1697
  if (cb.onError) cb.onError({ message: `PreToolUse hook blocked ${tag}.`, isWarning: true });
1622
1698
  logToolCall(tag, { args: call.slice(1) }, false, 'hook-blocked');
1623
1699
  results.push(resultStr);
1700
+ displayCores.push(null);
1624
1701
  if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'hook_blocked', exitCode: null, result: resultStr });
1625
1702
  continue;
1626
1703
  }
@@ -1663,6 +1740,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1663
1740
  if (cb.onError) cb.onError({ message: `Permission rule denied ${tag} (${ruleVerdict.reason}).`, isWarning: true });
1664
1741
  logToolCall((permDesc && permDesc.tag) || tag, { args: call.slice(1) }, false, `rule-denied:${ruleVerdict.reason}`);
1665
1742
  results.push(resultStr);
1743
+ displayCores.push(null);
1666
1744
  if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'rule_denied', exitCode: null, result: resultStr, rule: ruleVerdict.reason });
1667
1745
  continue;
1668
1746
  }
@@ -1680,6 +1758,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1680
1758
  if (cb.onPlanWithhold) cb.onPlanWithhold(tag, arg, permDesc);
1681
1759
  logToolCall(permDesc.tag || tag, { args: call.slice(1) }, false, 'withheld');
1682
1760
  results.push(resultStr);
1761
+ displayCores.push(null);
1683
1762
  if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'withheld', exitCode: null, result: resultStr });
1684
1763
  continue;
1685
1764
  }
@@ -1707,6 +1786,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1707
1786
  : `${tag} ${arg}: Permission denied by user.${reasonSuffix}`;
1708
1787
  logToolCall(permTag, { args: call.slice(1) }, false, 'denied');
1709
1788
  results.push(resultStr);
1789
+ displayCores.push(null);
1710
1790
  if (debugEntries) debugEntries.push({ tag, call, ms: 0, status: 'denied', exitCode: null, result: resultStr, rule: ruleVerdict.reason || undefined });
1711
1791
  aborted = true;
1712
1792
  break;
@@ -1733,8 +1813,9 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1733
1813
  const oneLine = String(arg).replace(/\s+/g, ' ').trim();
1734
1814
  const truncatedCmd = oneLine.length > 80 ? oneLine.slice(0, 77) + '...' : oneLine;
1735
1815
  const resultStr = `User interrupted execution after ${elapsedS}s. Tool was running: ${truncatedCmd}. Plan around this — do not retry the same long-running command.`;
1736
- if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } });
1816
+ const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } }) : null;
1737
1817
  results.push(resultStr);
1818
+ displayCores.push(displayCore || null);
1738
1819
  if (debugEntries) debugEntries.push({ tag, call, ms, status: 'aborted', exitCode: null, result: resultStr });
1739
1820
  aborted = true;
1740
1821
  break;
@@ -1754,8 +1835,9 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1754
1835
  const error = shellResult.exit_code !== 0
1755
1836
  ? { message: `exit ${shellResult.exit_code}`, code: shellResult.exit_code }
1756
1837
  : null;
1757
- if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error });
1838
+ const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error }) : null;
1758
1839
  results.push(await augmentWithHooks(tag, attrs, resultStr, preFeedback));
1840
+ displayCores.push(displayCore || null);
1759
1841
  if (debugEntries) debugEntries.push({
1760
1842
  tag,
1761
1843
  call,
@@ -1783,19 +1865,35 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1783
1865
  const oneLine = String(arg).replace(/\s+/g, ' ').trim();
1784
1866
  const truncatedArg = oneLine.length > 80 ? oneLine.slice(0, 77) + '...' : oneLine;
1785
1867
  const resultStr = `User interrupted execution after ${elapsedS}s. Tool was running: ${tag} ${truncatedArg}. Plan around this — do not retry the same long-running operation.`;
1786
- if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } });
1868
+ const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta: null, error: { message: 'aborted' } }) : null;
1787
1869
  results.push(resultStr);
1870
+ displayCores.push(displayCore || null);
1788
1871
  if (debugEntries) debugEntries.push({ tag, call, ms, status: 'aborted', exitCode: null, result: resultStr });
1789
1872
  aborted = true;
1790
1873
  break;
1791
1874
  } else {
1792
1875
  const resultStr = formatFileResult(call, fileResult);
1876
+ // view_image: stage the encoded image so it attaches to this turn's
1877
+ // tool-result message (below) and reaches the model as a vision block
1878
+ // next turn — same mechanism /image uses, no parallel encoder.
1879
+ if (fileResult && fileResult.image && typeof fileResult.image.data === 'string') {
1880
+ stagedImages.push(fileResult.image);
1881
+ }
1793
1882
  const meta = _metaForTool(tag, fileResult);
1794
1883
  const error = fileResult.error
1795
1884
  ? { message: fileResult.error, code: fileResult.error_code || null }
1796
1885
  : null;
1797
- if (cb.onToolEnd) cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error });
1886
+ // File-edit diff payload (execution-time rendering). Mutating file
1887
+ // tools attach _diffBefore/_diffAfter; hand them to onToolEnd so the
1888
+ // UI renders the diff for EVERY edit, independent of the permission
1889
+ // modal or approval state. Absent on non-mutating/loaded calls → null.
1890
+ const diff = (fileResult && typeof fileResult._diffBefore === 'string'
1891
+ && typeof fileResult._diffAfter === 'string')
1892
+ ? { before: fileResult._diffBefore, after: fileResult._diffAfter, path: fileResult.path || call[1] }
1893
+ : null;
1894
+ const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, resultStr, ms, { id: invocationId, call, attrs, meta, error, diff }) : null;
1798
1895
  results.push(await augmentWithHooks(tag, attrs, resultStr, preFeedback));
1896
+ displayCores.push(displayCore || null);
1799
1897
  if (debugEntries) debugEntries.push({
1800
1898
  tag,
1801
1899
  call,
@@ -1807,7 +1905,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1807
1905
  }
1808
1906
  } catch (err) {
1809
1907
  const ms = Date.now() - toolStart;
1810
- if (cb.onToolEnd) cb.onToolEnd(tag, `Error: ${err.message}`, ms, { id: invocationId, call, attrs, meta: null, error: err });
1908
+ const displayCore = cb.onToolEnd ? cb.onToolEnd(tag, `Error: ${err.message}`, ms, { id: invocationId, call, attrs, meta: null, error: err }) : null;
1811
1909
  if (cb.onError) {
1812
1910
  cb.onError({ message: `Tool error (${tag}): ${err.message}`, isWarning: true });
1813
1911
  } else {
@@ -1815,6 +1913,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1815
1913
  }
1816
1914
  logToolCall(tag, { args: call.slice(1) }, false, 'error');
1817
1915
  results.push(`${tag}: Error — ${err.message}`);
1916
+ displayCores.push(displayCore || null);
1818
1917
  if (debugEntries) debugEntries.push({ tag, call, ms, status: 'exception', exitCode: null, result: `Error — ${err.message}` });
1819
1918
  }
1820
1919
  }
@@ -1891,7 +1990,7 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1891
1990
  const reason = isAborted() ? 'user interrupted' : 'after user denied an action';
1892
1991
  if (isNativeCall) {
1893
1992
  for (let i = 0; i < results.length; i++) {
1894
- messages.push({ role: 'tool', tool_call_id: nativeToolCallIds[i], content: results[i] });
1993
+ messages.push(_nativeToolMessage(nativeToolCallIds[i], results[i], displayCores[i]));
1895
1994
  }
1896
1995
  } else {
1897
1996
  messages.push({
@@ -1905,14 +2004,44 @@ function createAgentRunner({ chatStream, extractToolCalls, agentExecShell, agent
1905
2004
 
1906
2005
  if (isNativeCall) {
1907
2006
  for (let i = 0; i < results.length; i++) {
1908
- messages.push({ role: 'tool', tool_call_id: nativeToolCallIds[i], content: results[i] });
2007
+ messages.push(_nativeToolMessage(nativeToolCallIds[i], results[i], displayCores[i]));
2008
+ }
2009
+ // view_image on the native rail: OpenAI `tool` messages can't carry image
2010
+ // parts, so stage the encoded image(s) on a trailing user turn (exactly the
2011
+ // /image mechanism). buildProviderMessages turns `images[]` into vision
2012
+ // blocks on the next request; the text result already landed on the tool
2013
+ // message above.
2014
+ if (stagedImages.length) {
2015
+ messages.push({
2016
+ role: 'user',
2017
+ content: 'The image(s) requested via view_image are attached to this message for your analysis. '
2018
+ + 'They are visible to you (the model) only — not shown to the user.',
2019
+ images: stagedImages,
2020
+ });
1909
2021
  }
1910
2022
  } else {
1911
2023
  const feedback = results.join('\n\n');
1912
- messages.push({
2024
+ // Phase 6b — XML rail replay parity. The feedback blob folds every tool
2025
+ // result of this turn into ONE {role:'user'} message and cannot be split
2026
+ // back by parsing (the only separator, \n\n, appears freely inside result
2027
+ // bodies). So persist the per-call display descriptors as a sibling
2028
+ // `_display[]` aligned 1:1 with `results` (same serialized cores the
2029
+ // native rail attaches, see _nativeToolMessage), preserving `null`s for
2030
+ // ops with no descriptor. `content` stays BYTE-IDENTICAL (Inv. 1) — the
2031
+ // model never sees `_display` (stripInternalKeys drops it before the wire).
2032
+ // Replay (chat-session.displayLoadedMessages) only renders per-call when
2033
+ // EVERY slot is a non-null known-version core; a single `null` (e.g. a web
2034
+ // op, out of scope until 6c) keeps the whole blob on the legacy summary.
2035
+ const resultsMsg = {
1913
2036
  role: 'user',
1914
2037
  content: `Tool execution results:\n\n${feedback}\n\nContinue with the task. If everything is done, summarize what was accomplished.`,
1915
- });
2038
+ _display: displayCores.slice(),
2039
+ };
2040
+ // view_image on the XML rail: the tool-result blob is a single user
2041
+ // message, which CAN carry image parts — attach the staged image(s) so
2042
+ // buildProviderMessages renders them as vision blocks next turn.
2043
+ if (stagedImages.length) resultsMsg.images = stagedImages;
2044
+ messages.push(resultsMsg);
1916
2045
  }
1917
2046
  }
1918
2047