npm - @semalt-ai/code - Versions diffs - 1.8.5 → 1.20.0 - Mend

@semalt-ai/code 1.8.5 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

package/.claude/settings.local.json +7 -1
package/.github/workflows/ci.yml +69 -0
package/ARCHITECTURE.md +6 -95
package/CLAUDE.md +196 -316
package/README.md +148 -4
package/docs/ARCHITECTURE.md +1321 -0
package/docs/CONFIG.md +340 -0
package/docs/HISTORY.md +245 -0
package/examples/embed.js +74 -0
package/index.js +251 -10
package/lib/agent.js +856 -120
package/lib/api.js +239 -50
package/lib/args.js +74 -2
package/lib/audit.js +23 -1
package/lib/background.js +584 -0
package/lib/checkpoints.js +757 -0
package/lib/commands/auth.js +94 -0
package/lib/commands/chat-session.js +489 -0
package/lib/commands/chat-slash.js +415 -0
package/lib/commands/chat-turn.js +669 -0
package/lib/commands/chat.js +407 -0
package/lib/commands/custom.js +157 -0
package/lib/commands/history-utils.js +66 -0
package/lib/commands/index.js +268 -0
package/lib/commands/mcp.js +113 -0
package/lib/commands/oneshot.js +193 -0
package/lib/commands/registry.js +269 -0
package/lib/commands/tasks.js +89 -0
package/lib/compact.js +87 -0
package/lib/config.js +360 -11
package/lib/constants.js +401 -3
package/lib/deny.js +199 -0
package/lib/doctor.js +160 -0
package/lib/headless.js +202 -0
package/lib/hooks.js +286 -0
package/lib/images.js +270 -0
package/lib/internals.js +49 -0
package/lib/mcp/boundary.js +131 -0
package/lib/mcp/client.js +270 -0
package/lib/mcp/oauth.js +134 -0
package/lib/memory.js +209 -0
package/lib/metrics.js +37 -2
package/lib/payload.js +54 -0
package/lib/permission-rules.js +401 -0
package/lib/permissions.js +123 -26
package/lib/pricing.js +67 -0
package/lib/proc.js +62 -0
package/lib/prompts.js +99 -8
package/lib/sandbox.js +568 -0
package/lib/sdk.js +328 -0
package/lib/secrets.js +211 -0
package/lib/skills.js +223 -0
package/lib/subagents.js +516 -0
package/lib/tool_registry.js +2862 -0
package/lib/tool_specs.js +263 -9
package/lib/tools.js +352 -1039
package/lib/ui/anim.js +86 -0
package/lib/ui/ansi.js +17 -27
package/lib/ui/chat-history.js +253 -71
package/lib/ui/create-ui.js +67 -24
package/lib/ui/diff.js +90 -25
package/lib/ui/file-activity.js +236 -0
package/lib/ui/format.js +195 -29
package/lib/ui/input-field.js +21 -11
package/lib/ui/md-stream.js +234 -0
package/lib/ui/render-operation.js +113 -0
package/lib/ui/select.js +1 -4
package/lib/ui/status-bar.js +146 -36
package/lib/ui/stream.js +20 -13
package/lib/ui/theme.js +190 -44
package/lib/ui/tool-operation.js +190 -0
package/lib/ui/utils.js +9 -5
package/lib/ui/web-activity.js +270 -0
package/lib/ui/writer.js +159 -45
package/lib/ui.js +1 -1
package/lib/verify.js +229 -0
package/lib/web-extract.js +213 -0
package/lib/web-summarize.js +68 -0
package/package.json +19 -4
package/scripts/lint.js +57 -0
package/test/agent-loop.test.js +389 -0
package/test/anim-driver.test.js +153 -0
package/test/ask-user-display.test.js +226 -0
package/test/ask-user-gate.test.js +231 -0
package/test/background.test.js +414 -0
package/test/chat-history-nocolor.test.js +155 -0
package/test/chat-relogin.test.js +207 -0
package/test/chat.test.js +114 -0
package/test/checkpoints-agent.test.js +181 -0
package/test/checkpoints.test.js +650 -0
package/test/command-registry.test.js +160 -0
package/test/compact.test.js +116 -0
package/test/completion-lazy.test.js +52 -0
package/test/config-merge.test.js +324 -0
package/test/config-quarantine.test.js +128 -0
package/test/config-write-guard-allow-anywhere.test.js +56 -0
package/test/config-write-guard-skip.test.js +46 -0
package/test/config-write-guard.test.js +153 -0
package/test/context-split.test.js +215 -0
package/test/cost-doctor.test.js +142 -0
package/test/custom-commands-chat.test.js +106 -0
package/test/custom-commands.test.js +230 -0
package/test/defer-detail-band.test.js +403 -0
package/test/deny-windows.test.js +120 -0
package/test/deny.test.js +83 -0
package/test/detail-band-tab-flatten.test.js +242 -0
package/test/download-allow-anywhere.test.js +66 -0
package/test/download-confine.test.js +153 -0
package/test/exec-diff.test.js +268 -0
package/test/executors.test.js +599 -0
package/test/extract-tool-calls.test.js +349 -0
package/test/fetch-url-validation.test.js +219 -0
package/test/file-activity.test.js +522 -0
package/test/fixtures/tool-calls.js +57 -0
package/test/fixtures/web-page.js +91 -0
package/test/git-tools.test.js +384 -0
package/test/grep-glob-serialize.test.js +242 -0
package/test/grep-glob.test.js +268 -0
package/test/grep-path-target.test.js +227 -0
package/test/harness/README.md +57 -0
package/test/harness/chat-harness.js +143 -0
package/test/harness/memwarn-headless-child.js +65 -0
package/test/harness/mock-llm.js +120 -0
package/test/harness/mock-mcp-server.js +142 -0
package/test/harness/sse-server.js +69 -0
package/test/headless.test.js +348 -0
package/test/history-utils.test.js +88 -0
package/test/hooks-agent.test.js +238 -0
package/test/hooks-verify-sandbox.test.js +232 -0
package/test/hooks.test.js +216 -0
package/test/http-get-user-agent.test.js +142 -0
package/test/images-api.test.js +208 -0
package/test/images.test.js +238 -0
package/test/input-field-ctrl-o.test.js +37 -0
package/test/live-height-physical.test.js +281 -0
package/test/max-iterations.test.js +218 -0
package/test/mcp-boundary.test.js +57 -0
package/test/mcp-client.test.js +267 -0
package/test/mcp-oauth.test.js +86 -0
package/test/md-stream.test.js +183 -0
package/test/memory-truncation-warning.test.js +222 -0
package/test/memory.test.js +198 -0
package/test/native-dispatch.test.js +409 -0
package/test/native-live-narration.test.js +254 -0
package/test/output-chokepoint.test.js +188 -0
package/test/output-heredoc-leak.test.js +195 -0
package/test/output-preview.test.js +245 -0
package/test/path-guards.test.js +134 -0
package/test/payload.test.js +99 -0
package/test/permission-rules-agent.test.js +210 -0
package/test/permission-rules.test.js +297 -0
package/test/permissions.test.js +362 -0
package/test/plan-mode.test.js +167 -0
package/test/read-paginate.test.js +275 -0
package/test/readonly-tools.test.js +177 -0
package/test/render-operation.test.js +317 -0
package/test/replay-descriptor-xml.test.js +216 -0
package/test/replay-descriptor.test.js +189 -0
package/test/replay-web-aggregate.test.js +291 -0
package/test/replay-web-persist.test.js +241 -0
package/test/result-cap.test.js +233 -0
package/test/running-glyph-anim.test.js +111 -0
package/test/sandbox-agent.test.js +147 -0
package/test/sandbox-integration.test.js +216 -0
package/test/sandbox.test.js +408 -0
package/test/sdk.test.js +234 -0
package/test/shell-output-cap.test.js +181 -0
package/test/skills-chat.test.js +110 -0
package/test/skills.test.js +295 -0
package/test/smoke.test.js +68 -0
package/test/status-bar-driver.test.js +93 -0
package/test/status-bar-pause.test.js +164 -0
package/test/status-bar-resync.test.js +188 -0
package/test/stream-parser.test.js +171 -0
package/test/subagents-agent.test.js +178 -0
package/test/subagents.test.js +222 -0
package/test/theme-palette.test.js +166 -0
package/test/tool-registry.test.js +85 -0
package/test/trim-budget.test.js +101 -0
package/test/truncate-visible.test.js +78 -0
package/test/verify-agent.test.js +317 -0
package/test/verify.test.js +141 -0
package/test/view-image.test.js +199 -0
package/test/web-activity-ordering.test.js +203 -0
package/test/web-activity.test.js +207 -0
package/test/web-data-extraction-guidance.test.js +71 -0
package/test/web-extract.test.js +185 -0
package/test/web-fetch-agent.test.js +291 -0
package/test/web-fetch-mode.test.js +193 -0
package/test/web-search.test.js +380 -0
package/lib/commands.js +0 -1438
package/path +0 -1

package/docs/HISTORY.md ADDED Viewed

@@ -0,0 +1,245 @@
+# semalt-code — History, Decisions & Rationale
+> Dependency-policy rationale, the full "Key Patterns & Invariants" reference,
+> and the "Deferred / Not Yet Implemented" roadmap. **Not auto-loaded** as project
+> memory. The lean `CLAUDE.md` carries the compressed, verified invariant set;
+> this file preserves the long-form rationale and the per-task history.
+> Per-task (Task X.Y) rationale and the "Tested by …" enumerations live inline in
+> `docs/ARCHITECTURE.md` alongside each subsystem they describe.
+---
+## Dependency & Supply-Chain Policy (Task 3.2)
+The project ran **zero runtime dependencies** through Phase 2. Adopting the official
+MCP SDK (`@modelcontextprotocol/sdk`) in v1.9.0 ends that era. The invariant is now
+**minimal, vetted, pinned dependencies** — not "no dependencies."
+**When a runtime dependency is allowed.** Every new runtime dependency must be:
+1. **Minimal** — added only when a Node.js built-in genuinely cannot do the job.
+   The bar for the *first* dependency was high on purpose; the bar for the next one
+   is the same. Dev-only tooling is still avoided (we lint with `node --check` and
+   test with `node:test`).
+2. **Justified** — a one-line rationale recorded here (see below) and in the PR.
+3. **Pinned to an exact version** — no `^`/`~`/ranges in `package.json`. Upgrades are
+   deliberate, reviewed commits, never silent on `npm install`.
+4. **Reviewed** — adding/bumping a dependency is a reviewed change, and the
+   regenerated `package-lock.json` is committed in the same PR.
+**Rationale for the web-extraction deps (Task W.1, all pinned exact).** The
+web-fetch pipeline (see **Web Fetch Pipeline** below) turns raw HTML into
+main-content Markdown — reliably parsing real-world malformed HTML, scoring the
+main article over chrome, and emitting clean Markdown are each large, bug-prone
+surfaces where a hand-rolled regex approach is exactly the wrong call (quality is
+the whole point). The chosen libraries are the reference implementations:
+- **`@mozilla/readability` (`0.6.0`)** — Firefox Reader View's extractor; the
+  de-facto standard for "main content of a page." MIT. **Zero transitive deps.**
+- **`turndown` (`7.2.4`)** — the reference HTML→Markdown converter. MIT. One
+  transitive dep (`@mixmark-io/domino`, a DOM impl).
+- **`linkedom` (`0.18.12`)** — a light DOM for Readability to operate on
+  (`jsdom` is far heavier and unnecessary here). MIT. Transitive footprint:
+  `css-select`, `css-what`, `boolbase`, `nth-check`, `domhandler`,
+  `domelementtype`, `domutils`, `dom-serializer`, `entities`, `cssom`,
+  `htmlparser2`, `html-escaper`, `uhyphen` (`canvas` is an *optional* dep, left
+  uninstalled). **Total added: ~18 packages, `npm audit` clean (0 advisories).**
+All three are loaded directly (CommonJS-compatible) from `lib/web-extract.js` —
+no ESM boundary needed (unlike the MCP SDK).
+**Rationale for `@modelcontextprotocol/sdk` (pinned `1.29.0`).** MCP is an open
+protocol with a non-trivial wire contract (JSON-RPC framing, capability negotiation,
+transport lifecycle, schema validation). Reimplementing it by hand would be a large,
+bug-prone surface to own and keep in spec. The **official** SDK is the reference
+implementation, MIT-licensed, and tracks the spec — exactly the case where a vetted
+dependency beats a built-in reimplementation. It is the foundation Task 3.3 builds the
+MCP client on.
+**ESM/CJS boundary.** The SDK is **ESM-only** (`"type": "module"`); this project is
+CommonJS. A CJS module cannot `require()` an ESM-only package. The entire codebase
+stays CommonJS — the SDK is loaded in exactly one place, `lib/mcp/boundary.js`, via
+dynamic `import()`, which re-exposes a CJS-friendly async surface (`loadSdk`,
+`createClient`, `createStdioTransport`). No other module imports the SDK directly.
+See **MCP Boundary** below.
+**Lockfile + CI guardrails.** `package-lock.json` is committed. CI (`.github/workflows/ci.yml`) runs:
+- `npm ci` — installs strictly from the lockfile; fails on package.json↔lockfile drift (integrity).
+- `npm audit --omit=dev --audit-level=high` — fails the build on a **HIGH or CRITICAL**
+  advisory in the **runtime** (production) dependency tree. Dev deps are excluded
+  (there are none today).
+**Audit-findings policy.** When `npm audit` flags an advisory:
+- **Critical / High** → **blocking.** CI fails. Resolve before merge by bumping to a
+  patched pinned version (regenerate + commit the lockfile), or — if no fix exists —
+  removing/replacing the dependency. A temporary, time-boxed exception requires an
+  explicit `npm audit` allow-list entry **with a written justification and a tracking
+  issue**; it is not the default.
+- **Moderate / Low** → **non-blocking** (the `--audit-level=high` gate lets them pass)
+  but **tracked**: open an issue and address on the next dependency-maintenance pass.
+  Do not raise the gate to fail on these without agreement — noisy gates get ignored.
+- **Routine maintenance** → periodically run `npm audit` and `npm outdated`; dependency
+  bumps follow the pinning + review rules above.
+---
+## Key Patterns & Invariants
+- **Minimal, pinned dependencies**: prefer Node.js built-ins; a runtime dependency must be minimal, justified, pinned to an exact version, and reviewed (see **Dependency & Supply-Chain Policy**). Today: `@modelcontextprotocol/sdk` (MCP) and the web-extraction set `@mozilla/readability` + `linkedom` + `turndown` (Task W.1).
+- **CommonJS**: all files use `require()`/`module.exports`. Do not use ES `import`/`export`. The one exception is the **dynamic** `import()` inside `lib/mcp/boundary.js`, which is the sole bridge to the ESM-only MCP SDK — the project itself stays CommonJS.
+- **Streaming**: `api.js` manually parses `text/event-stream`. The parser in `chatStream()` handles partial JSON lines — be careful editing it.
+- **Permissions are per-session**: `PermissionManager` resets on each CLI invocation. Approvals never persist to disk. In non-TTY mode tool calls that would normally need interactive confirmation are **refused** (not auto-approved) unless `--dangerously-skip-permissions` is set, or the tag is pre-approved by an `--allow-*` tier flag.
+- **Destructive-command deny-list** (`lib/deny.js`): every shell call (`exec`/`shell`) passes through `classifyShellCommand()` at the single chokepoint in `agentExecShell`, in *all* modes and regardless of `--allow-*` flags. Handling depends on the **initiator**:
+  - **Agent-initiated** (the model asked, the default): any deny-list hit is a **hard block** — `rm -rf`, `curl … | sh`, disk-wipe/fork-bomb patterns, recursive chmod/chown on a system root, and writes to system paths.
+  - **User-initiated** (a human typed `!cmd` or `semalt-code shell`): the user owns their machine, so a deny-list hit is **not** hard-blocked. The exception is the **catastrophic subset** (`catastrophic: true` — disk-wipe / block-device write, fork bomb), which interposes a single y/N confirmation as a typo guard; all other deny-listed user commands run with a `bypassed` note.
+  - The only full bypass (skips classification entirely) is `--dangerously-skip-permissions`.
+  - **Cross-platform + canonicalized (Task 4.4):** the list now covers the
+    **Windows** destructive set (`del /s`, `rd`/`rmdir /s`, `Remove-Item -Recurse
+    -Force`, `format`, `Format-Volume`, `Clear-Disk`, `cipher /w`, `diskpart …
+    clean`) in addition to POSIX — relevant because native Windows has no OS
+    sandbox. Matching also runs against a **procfs-root-canonicalized** variant
+    (`/proc/self/root` and `/proc/<pid>/root` rewritten to `/`) so a
+    `/proc/self/root/etc/…` bypass is caught by the same system-path matchers
+    (the resolved-path principle, shared with the OS sandbox).
+- **Untrusted web content**: `http_get` runs the **web-fetch pipeline** (Task W.1 / W.1b, `mode` = summarized→extract→Markdown→secondary-LLM summary / extracted→Markdown / raw→original token-capped content) so by default only a compact result enters context (`raw` mode deliberately returns the original markup, still **token-capped**, for page analysis); the result in **every** mode is wrapped in the explicit `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` block (`lib/agent.js`), and the secondary summarizer treats the page as data-only (a page injection could have steered it). The system prompt (`lib/prompts.js`) instructs the model never to act on instructions inside such a block. MCP tool results and **lifecycle-hook output** reuse the same fence. See **Web Fetch Pipeline**.
+- **Lifecycle hooks are deny-listed + sandboxed shell + untrusted output** (`lib/hooks.js`): a `PreToolUse` non-zero exit blocks the tool; every hook command passes through `checkShellDenylist` AND the **OS sandbox** (`resolveSandboxedSpawn`, Pre-Task 5.0a) before running; hook stdout is fenced as untrusted before it reaches the model; timeouts/sandbox-refusals/failures are contained and never crash the loop. **Project-layer command hooks and `verify.command` are quarantined** (`loadHookLayers`/`loadVerifyLayers`): a cloned-repo `.semalt/config.json` can never introduce host-privileged execution, only inert prompt text.
+- **`--readonly` blocks every file-mutating tool** (`READONLY_BLOCKED`, `lib/permissions.js`, completed in Pre-Task 5.0c): `write_file`, `append_file`, `edit_file`, `replace_in_file`, `delete_file`, `make_dir`, `remove_dir`, `move_file`, `copy_file`, `upload`, `download`. The block is enforced at the executor (`permissionManager.readonlyBlock(tag)`), so it holds for both the XML and native paths; `describePermission` also short-circuits the gate (no approval prompt precedes the deterministic block). **Scope decision (load-bearing): `--readonly` governs FILE TOOLS only.** Shell (`exec`/`shell`) is **not** in the set — a read-only session must still run read-only commands (`ls`, `git status`), and a shell command's arbitrary write side effects are the **OS sandbox + deny-list's** job to confine (the right layer post-Pre-Task 5.0a), not `--readonly`. So `--readonly` is an honest "no file-tool writes," not a false "no writes at all." Read-only file tools (`read_file`, `grep`, `glob`, `search_in_file`, `file_stat`, `list_dir`) work unchanged. Tested by `test/readonly-tools.test.js`.
+- **Secret-file read guard**: `isProtectedSecretPath()` in `tools.js` refuses reads/copies/moves of `config.json`, `memory.json`, and `audit.log` via file tools — **not** overridable by `--allow-anywhere` (only by `--dangerously-skip-permissions`).
+- **Config-write guard** (`isProtectedConfigPath()` in `tools.js`, Pre-Task 5.0b): the write-side companion to the read guard. Every write executor (`write_file`, `append_file`, `edit_file`, `replace_in_file`, `move_file`/`copy_file` **dst**, `upload`, `download`) refuses to write into the **protected-config set** — the whole `~/.semalt-ai` dir **and** every project `.semalt` dir from the CWD up to the repo root, **including files that do not yet exist** (directory-prefix matched on the resolved path, so a missing `.semalt/config.json`/`agents/*.md`/hook is covered). The set is defined once as `protectedConfigDirs` (`lib/constants.js`) and shared with the OS sandbox's `protectedPaths`. Same bypass policy as the read guard: **not** overridable by `--allow-anywhere`, only by `--dangerously-skip-permissions` (human-only). This guards the **agent's** file tools and the sandboxed shell — a human editing their own config in an editor is unaffected. Tested by `test/config-write-guard*.test.js`, `test/path-guards.test.js`, and the kernel case in `test/sandbox-integration.test.js`.
+- **Per-pattern permission rules** (`lib/permission-rules.js`, Task 4.1): allow/deny/ask rules matching tool + argument (glob/regex), layered user→project. **Project rules can only NARROW** — every project `allow` is structurally dropped before resolution, so a cloned-repo `.semalt/config.json` can never widen the user posture. Precedence is total/deterministic (deny>ask>allow, most-specific then most-restrictive). Arguments are canonicalized (`..`/symlink/abs-rel) before matching; pathological/malformed rules fail closed; an `allow` never bypasses the deny-list, secret guard, `--readonly`, or `isPathSafe` (those stay in the executors). A `deny` rule holds even under `--dangerously-skip-permissions`. See **Per-Pattern Permissions** above.
+- **Checkpoints & rewind** (`lib/checkpoints.js`, Task 4.3 / 4.3b): before each file-tool mutation the file's prior state is snapshotted (post-gate, pre-mutation, in `agentExecFile`) so `/rewind` can restore it — **file-tool changes only; shell side effects are not reversible.** Capture is fail-safe (a snapshot failure never blocks the mutation); a denied/withheld call produces no checkpoint; subagent mutations are checkpointed into the parent session. Delete/move are reversed explicitly; an external-modification check warns/asks before clobbering out-of-band edits. A per-file size cap and per-session retention are enforced. **Rewind is human-only (no rewind tool in the registry).** Task 4.3b: the restore path **re-validates the current guards** (`isPathSafe`/secret/protected-config/`deny` rule) per target — a now-forbidden path is refused/skipped, and `force` overrides only the external-mod check, not the guards; **three restore modes** `code`/`conversation`/`both` (default both) restore files, history, or the linked state, with conversation truncation cutting on **turn boundaries** (no orphaned `tool_call`; discard policy) — all on the **unchanged** on-disk schema. See **Checkpoints & Rewind** above.
+- **Native git tools** (`lib/tool_registry.js`, Task 5.1): eight first-class git tools shelling out through the **same** `agentExecShell` sandbox + deny-list chokepoint as `<shell>` (no privileged path around confinement), parsing output into structured results. Read-only (`git_status`/`git_diff`/`git_log`, plus the *list* ops of `git_branch`/`git_worktree`) return a null permission descriptor; mutating (`git_add`/`git_commit`/`git_branch`/`git_checkout`/`git_worktree` add/remove) require approval, honor `--readonly`, and pass the per-pattern rules. `git_commit` requires a real non-empty message (empty → error, never a placeholder). **Destructive-git ↔ checkpoint honesty:** git operations are NOT reversible via `/rewind` (checkpoints snapshot file-tool mutations only) — stated in the descriptions and prompt text. Not-a-repo / git-absent degrade gracefully. See **Native Git Tools** above.
+- **API-key sourcing** (`lib/secrets.js`): precedence is `SEMALT_API_KEY` env → OS keychain (macOS `security` / Linux `secret-tool` / Windows PasswordVault) → `config.json`. Keys from env/keychain are never written back to config; `configShow` reports only `api_key_source`. Store a key with `semalt-code auth set-key`.
+- **Token counting is approximate**: `estimateTokens()` divides char count by 4. It is used only for the `/compact` display — do not rely on it for hard limits.
+- **Context trimming is proactive when a limit is known**: `chatStream()` uses the in-process `_sessionInputLimits` learned from a prior 400 overflow first, then falls back to `config.context_length * 0.9`. When neither is set, no pre-flight trim runs and the client relies on the reactive 400/413 handler (which then persists the discovered window). `Metrics.tokenLimitStatus()` returns `{ used, limit: null }` until a limit is learned, so the status bar shows "N tok · limit unknown" instead of hiding the line.
+- **Shell/exec output entering context is bounded** (Task W.6, `capShellOutput` in `lib/agent.js`): the model-facing shell result is double-bounded — a **head+tail line cap** (`max_output_lines`, default 50, split first ~60% + last ~40% via `OUTPUT_HEAD_RATIO`) eliding the middle, **then** a **token safety net** (`max_output_tokens`, default 10000, reusing the web pipeline's `capToTokens`) so a few enormous lines (minified JS, a binary `cat`) can't blow context. The elision notice teaches the W.5-enabled redirect-to-file→grep pattern. **The exit code stays on its own line, so truncating output VOLUME never hides the command's OUTCOME** (a non-zero exit / failure is always surfaced). Applied at the context boundary in the agent loop — distinct from the **UI** cap (`lib/ui/diff.js`, display only), which stays. Before W.6 the cap was UI-only and the model received the **entire** unbounded stdout+stderr (the #1 context risk). Pure helper, unit-tested on the model-facing text + a real-loop assertion (`test/shell-output-cap.test.js`). MCP/subagent output bounding is Task W.8 (below); W.9 unifies all the paths into a shared chokepoint.
+- **MCP & subagent results entering context are bounded** (Task W.8, `formatMcpResult`/`formatSubagentResult` in `lib/agent.js`): the last two unbounded paths. Both apply `capToTokens` (the W.5–W.7 standard) to the result text **before** wrapping it in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence, with **distinct budgets reflecting their nature**: **MCP is stricter** (`mcp.max_result_tokens`, default **10000**) because the payload size is third-party/server-controlled and untrusted — the riskiest path; **subagent is generous** (`subagents.max_result_tokens`, default **20000**) because the child's final text is our own deliberate, synthesized answer (a safety net against a verbose child). For MCP the truncation notice sits **inside** the fence with the capped content — capping never weakens the untrusted perimeter; subagent isolation / no-escalation (3.6/4.5) are unchanged (this bounds returned-text size only). A small result passes through fully, no notice. Pure helpers, unit-tested on the model-facing/parent-facing text incl. the fence-still-present and budgets-differ cases + real-loop assertions (`test/result-cap.test.js`).
+- **`read_file` is paginated** (Task W.7, `formatReadResult` in `lib/agent.js`): `read_file` used to dump the **whole file verbatim** into context (`File <path>:\n` + the entire content); the only guard was a hard byte refusal at `max_file_size_kb`. Worst case ~128k tokens for a 500 KB file. Now the **model-facing** result is paginated, mirroring the Claude Code standard: under a **line cap** (`read_line_cap`, default **2000**) the file reads **byte-for-byte as before** (no regression for the common small-file case); over the cap it returns the first page + a **`[PARTIAL]` notice** — `Showing lines 1–2000 of 5234. Read more with start_line=2001.` **`start_line`/`end_line`** (on both XML + native rails; absent → null, tuple parity) read an explicit slice, **also line-capped** so a huge explicit range can't dump everything. A **token safety net** (`read_max_tokens`, default **25000**, reusing the web pipeline's `capToTokens`) bounds the pathological few-but-enormous-lines case (one 100 KB minified line) the line cap misses — consistent with W.6's double-bound. The bound is applied at the **context boundary** in the formatter (the executor still returns the full content, like W.5/W.6); pagination — not the byte cap — is the primary bound, so `max_file_size_kb` is now a **backstop** (raised default **50 MB**) ruling out a multi-GB whole-file slurp (lower it to hard-refuse smaller files). **Line numbers are OPTIONAL, default OFF** (`show_line_numbers`): the **Step 0 finding** is that `edit_file` is **line-number-based** (`lines[N-1]=content`) while `replace_in_file` is **match-based** (regex on a search string) — a mix — so always-on numbers would corrupt copyable snippets for the match path **and** cost ~1.7× per read; the param turns absolute 1-based numbers on (aligned with `edit_file`'s addressing) for when the agent wants line refs to drive an edit. Line indexing matches `edit_file`'s `split('\n')` exactly, so the read→edit loop stays aligned. 
+  Pure helper, unit-tested on the model-facing text incl. the no-regression small-file case + the PARTIAL large-file case + rail parity + read→edit alignment (`test/read-paginate.test.js`).
+- **grep/glob results are serialized + bounded** (Task W.5, `formatGrepResult`/`formatGlobResult` in `lib/agent.js`): `formatFileResult` now has `case 'grep'`/`case 'glob'` that turn the structured engine result into model-facing text — closing a correctness bug where both fell through the default and the model received `"grep: done"`/`"glob: done"` (the data was computed and even shown in the UI, but never entered context, making grep-first navigation impossible). grep `output_mode` (`content`/`files_with_matches`/`count`) is model-selectable via the spec; `head_limit` (default `DEFAULT_GREP_HEAD_LIMIT`/`DEFAULT_GLOB_HEAD_LIMIT` = 100) + optional `offset` bound what reaches the model — the engine's 1000/5000 internal caps were never a context bound (the result was dropped before it reached context). Over-limit serialization carries a truncation notice telling the agent how to narrow (refine the pattern, switch to `count`/`files_with_matches`, or raise `head_limit`); under-limit results show fully with no notice. The executors (`lib/tool_registry.js`) normalize and attach `output_mode`/`head_limit`/`offset` onto the result; the serializers are pure and tested on the **model-facing** text (`test/grep-glob-serialize.test.js`, incl. the real-loop regression).
+- **Tool output enters context ONLY via the `boundToolOutput` chokepoint** (Task W.9, `lib/agent.js`): the size analogue of the `resolveSandboxedSpawn` sandbox chokepoint. W.5–W.8 each bounded a previously-unbounded path, but the `capToTokens`-+-fence step was duplicated ad-hoc in five places — the original bugs (grep/glob `"done"`, shell/MCP/subagent unbounded) were all the **same class**: a path that put output into context without bounding it. `boundToolOutput(text, { budget, notice, fenced })` is the **single application point**: it applies `capToTokens` with the path's **budget** and **notice** function and (when `fenced`) wraps in the `<<<UNTRUSTED_EXTERNAL_CONTENT>>>` fence. **grep/glob, shell, read_file, MCP, subagent — and http_get/web_search — all route through it.** The per-path policy is **deliberately distinct and NOT flattened**: budgets (MCP 10k < subagent 20k < read 25k; shell 10k; grep/glob `DEFAULT_GREP_GLOB_MAX_TOKENS` 10k — a new token net so a few huge minified match lines can't blow context, the W.6 lesson applied to grep's count-bound), notice wording (shell teaches redirect→grep, read teaches narrow-the-range, …), and the fence flag (MCP/subagent/web fenced; file/shell not). **Refactor-safe:** model-facing outputs are byte-identical to W.5–W.8 (the W.5–W.8 test suites pass unchanged); http_get/web_search bodies are already token-capped upstream so they pass **no budget** (fence only). **Structural regression prevention:** a new tool gets bounding by *routing* its output through the chokepoint, not by *remembering* to cap. Pure helper, unit-tested on the chokepoint behavior, per-path policy, the bound-by-construction invariant, and equivalence (`test/output-chokepoint.test.js`). 
+  The system prompt's `LOCAL_NAVIGATION_NOTICE` (`lib/prompts.js`, both templates) — now actionable post-W.5 — steers the grep-first / read-slice pattern: locate with `grep`/`glob` (`count`/`files_with_matches` modes), then `read_file` only the relevant `start_line`/`end_line` slice; redirect large command output to a file and grep it.
+- **Bounded agent iterations**: the primary loop caps at `config.max_iterations` (default 125, via `DEFAULT_MAX_ITERATIONS` in `constants.js`), overridable with `--max-iterations <n>`; `--max-iterations 0`/`"unlimited"` removes the cap deliberately. Reaching the cap stops gracefully (clear message + `stopReason: "max_iterations"`), never silently. Subagents have their own cap of 12.
+- **Malformed tags are skipped**: each tool dispatch in the agent loop is wrapped in try/catch; errors emit a warning line and continue to the next tool call.
+---
+## Deferred / Not Yet Implemented
+This section exists because false documentation has burned this project before (a
+"max 10 iterations" invariant that never existed; coverage assumed but absent). The
+items below are things a reader might reasonably expect from the docs or from peer
+tools but that the code **does not do today**. They are listed honestly so nobody
+builds on a feature that isn't there. Each is marked **Planned (Phase 4+)** —
+on the roadmap — **Out of scope** — no current plan — or ✅ **Done** once it has
+shipped (kept here with a pointer to the section that documents it).
+**Gaps the re-audit found in existing behavior:**
+- **MCP in headless / one-shot** — *Planned (Phase 4+).* `connectAll()` runs only in
+  interactive `cmdChat` (and the `mcp` management commands); `code`/`edit`/`shell`/`-p`
+  never connect a manager, so MCP tools are unavailable there. See **MCP Client → Scope**.
+- **Session auto-resume** — *Planned (Phase 4+).* Sessions are saved, but there is no
+  startup prompt offering to resume the most recent (< 24 h) session. Resume is always
+  explicit: `/history` (local) or `--resume <id>` (dashboard). See **Session Storage**.
+- **Corporate-proxy consumption** — *Planned (Phase 4+).* `HTTPS_PROXY`/`HTTP_PROXY`
+  are parsed into config but `api.js` does not route requests through a proxy agent,
+  so they have no effect on outbound HTTP. See **Config hierarchy → Environment**.
+**Phase 4 roadmap (all four items now ✅ Done; listed in the originally planned order):**
+- **Per-pattern permissions** — ✅ **Done (Task 4.1).** Rich allow/deny/ask rules
+  matching tool + argument (glob/regex), layered user→project. See **Per-Pattern
+  Permissions** above.
+- **Self-verification** — ✅ **Done (Task 4.2).** When the agent declares done,
+  optionally run a configured verify command (advisory feeds the result back;
+  enforcing returns the agent to the loop until verify passes, bounded by
+  `max_attempts` → `verify_failed`). See **Self-Verification** above.
+- **Checkpoints / rewind** — ✅ **Done (Task 4.3 file half + Task 4.3b
+  conversation + restore re-validation).** Per-write file snapshots before each
+  file-tool mutation; `/rewind` restores prior content (last or to a chosen
+  sequence), with delete/move handled and an external-modification check that never
+  silently clobbers out-of-band edits. **File-tool changes only — shell side
+  effects are not reversible.** Task 4.3b closed the last deferred 4.3 security
+  finding (the restore path now **re-validates the current
+  isPathSafe/secret/protected-config/`deny`-rule guards** per target — `force`
+  overrides only the external-mod check) and added **three restore modes**
+  (`code`/`conversation`/`both`, default both) using the existing turn-linkage,
+  with conversation truncation cutting on **turn boundaries** (no orphaned
+  `tool_call`; discard policy) on the **unchanged** on-disk schema. Rewind stays
+  **human-only** (no rewind tool registered). See **Checkpoints & Rewind** above.
+- **OS sandbox** — ✅ **Done (Task 4.4 filesystem + Task 4.4b network).** Real
+  OS-level confinement for shell commands: Seatbelt (macOS) / bubblewrap
+  (Linux/WSL2) jail every command and its children, confining writes to the working
+  dir and keeping `~/.semalt-ai`/secrets/`/etc` read-only (incl. not-yet-existing
+  files), with a fail-safe ask-or-block fallback when the primitive is absent and no
+  model-reachable way to disable it. **Network isolation is now done as well —
+  binary on/off** (bwrap `--unshare-net` / Seatbelt `(deny network*)`), no host
+  proxy / no domain allowlist / no TLS interception, anti-fail-open default. See
+  **OS Sandbox** above.
+**Done since:**
+- **Native git tooling** — ✅ **Done (Task 5.1).** Eight first-class git tools
+  (`git_status`/`git_diff`/`git_log` read-only; `git_add`/`git_commit`/`git_branch`/
+  `git_checkout` mutating; `git_worktree` infrastructure) shelling out through the
+  sandbox + deny-list chokepoint with structured results. The long tail stays in the
+  generic shell. See **Native Git Tools** above.
+- **Embedding SDK** — ✅ **Done (Task 5.2).** Two-tier library surface separated by
+  `package.json` `exports`: the stable `createAgent` facade (main entry) and the
+  unstable building blocks (`/internals`). Programmatic permission policy that
+  defaults to refusing mutations; sandbox/deny-list stay on with explicit opt-out;
+  `close()` teardown; per-instance config (process-global limits documented). See
+  **Embedding SDK** above.
+- **Background tasks** — ✅ **Done (Task 5.3).** `run --background` launches a
+  detached agent process (own process = own global state, reusing the
+  `createAgent` facade) with a launch-fixed, refuse-by-default policy and
+  sandbox/deny-list on; a file-based task registry (`~/.semalt-ai/tasks/`) drives
+  `tasks list|status|result|kill|prune`. Validation runs before detach (no
+  orphans); stale/dead tasks are detectable and prunable; kill tree-kills by PID.
+  Background-launch is intentionally NOT an agent tool. See **Background Tasks**
+  above.
+- **Multimodal image input** — ✅ **Done (Task 5.4).** PNG/JPEG/WebP/GIF attach via
+  `--image` (repeatable), in-chat `/image`, and the SDK `images` option; read
+  through `isPathSafe`, size-capped (`image_max_bytes`), base64-encoded, media
+  type detected from magic bytes. The provider content-part shape (Anthropic-style
+  vs OpenAI-style) is selected per profile/heuristic; a text-only model fails loud
+  (the image is never silently dropped). PDF input deferred; generation out of
+  scope. See **Multimodal Image Input** above.
+**Planned, not yet scheduled:**
+- **Cost caps** — hard spend limits per session/turn (today cost is *displayed* via
+  `lib/pricing.js`, never enforced).
+- **Auto-update** — self-updating the CLI (today: `npm install -g` manually).
+- **XDG / `%APPDATA%` config dirs** — honoring platform config-dir conventions instead
+  of the fixed `~/.semalt-ai/`.
+- **Domain-allowlist network policy** — *deliberately deferred, may stay out of
+  scope.* Task 4.4b ships **binary** network isolation (on / kernel-level none); a
+  per-domain allowlist ("allow github.com, block the rest") is **not** implemented
+  and is **not** a planned increment by default. **Rationale:** domain-granularity
+  requires a host-side egress proxy with full network privileges, which is the
+  exact design the reference implementation shipped and that was **bypassed
+  completely, twice, over 5.5 months** (allowedDomains fail-open CVE-2025-66479, a
+  hostname-parser differential, and TLS-MITM breaking Go binaries). We will only
+  revisit this if it can be done **without** a host proxy / TLS interception (e.g.
+  a kernel/eBPF egress filter on resolved IPs) — until then, binary isolation is
+  the robust posture. See **OS Sandbox → Why binary**.
+- **Native-Windows / WSL1 sandbox** — no OS primitive today (bwrap needs the
+  user/mount namespaces WSL1 lacks; native Windows has none). On those platforms
+  the sandbox degrades to the fail-safe fallback (ask-or-block); the Windows
+  deny-list (now covered, Task 4.4) is the remaining shell guard there.
+**Out of scope (no current plan):**
+- **Multimodal — image *input*** is ✅ **Done (Task 5.4)** — PNG/JPEG/WebP/GIF
+  attached via `--image` / `/image` / the SDK `images` option, sent provider-
+  specifically to vision models (text-only models fail loud). See **Multimodal
+  Image Input** above. Still out of scope: **PDF input** (deferred), **audio
+  input**, and **image/audio *generation* / output**.
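+- **Background / cloud / scheduling** — local detached background tasks are ✅ **Done
+  (Task 5.3)**; see **Background Tasks** above. Still out of scope: long-running
+  background agents beyond those local detached tasks, cloud execution, and
+  cron-style scheduling.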
+- **OpenTelemetry** — OTel traces/metrics export.
+- **Managed policy** — centrally-administered org policy enforcement.
+- **Native notifications** — OS-level desktop notifications.
+---

package/examples/embed.js ADDED Viewed

@@ -0,0 +1,74 @@
+#!/usr/bin/env node
+'use strict';
+// ---------------------------------------------------------------------------
+// Embedding SDK example (Task 5.2)
+// ---------------------------------------------------------------------------
+//
+// Shows the supported, stable way to embed the agent in another program via the
+// `createAgent` facade: a permission policy that defaults safe, streaming
+// events, the structured run result, and the required close() teardown.
+//
+// Run it against any OpenAI-compatible endpoint:
+//
+//     SEMALT_API_BASE=http://127.0.0.1:8800 \
+//     SEMALT_API_KEY=sk-… \
+//     SEMALT_MODEL=my-model \
+//     node examples/embed.js "List the files in this directory"
+//
+// (From outside this repo, `require('@semalt-ai/code')` instead of the relative
+// path below.)
+const { createAgent } = require('../lib/sdk'); // → require('@semalt-ai/code')
+async function main() { // Runs one prompt through an embedded agent, prints the result, and always closes the agent.
+  const prompt = process.argv.slice(2).join(' ') || 'Say hello and tell me what tools you have.'; // all CLI args joined by spaces; canned greeting when none are given
+  const agent = createAgent({
+    apiBase: process.env.SEMALT_API_BASE || 'http://127.0.0.1:8800',
+    apiKey:  process.env.SEMALT_API_KEY  || 'any',
+    model:   process.env.SEMALT_MODEL    || 'default',
+    // Permission policy. With NONE of these, the SDK refuses every mutating
+    // tool (the safe default). This callback is a deny-list: it vetoes only the
+    // three destructive file tools named below and approves every other tag — your host decides.
+    approve: async ({ tag, description }) => {
+      const denied = new Set(['delete_file', 'remove_dir', 'move_file']);
+      const ok = !denied.has(tag);
+      console.error(`[approve] ${ok ? 'ALLOW' : 'DENY '} ${tag} — ${description}`); // decisions go to stderr; stdout carries the streamed tokens and the final result
+      return ok;
+    },
+    // The OS sandbox + deny-list stay ON by default. To run unsandboxed when the
+    // kernel primitive is missing you'd opt in explicitly, e.g.:
+    //   sandbox: { mode: 'off' },
+    //   onUnsandboxed: async () => true,
+  });
+  // Stream activity (advisory — the run result is authoritative).
+  agent.on('token', (t) => process.stdout.write(t));
+  agent.on('tool', (e) => console.error(`\n[tool] ${e.tag} (${e.ms}ms)`));
+  agent.on('warning', (m) => console.error(`[warn] ${m}`));
+  try {
+    const res = await agent.run(prompt);
+    console.log('\n\n--- result ---');
+    console.log(res.result);
+    console.log('--- meta ---');
+    console.log(JSON.stringify({
+      toolCalls: res.toolCalls.length, // only the number of tool calls is printed, not their contents
+      usage: res.usage,
+      cost: res.cost,
+      stopReason: res.stopReason,
+      verifyStatus: res.verifyStatus,
+    }, null, 2));
+  } finally {
+    // ALWAYS close — releases MCP connections / spawned processes. Runs even when agent.run() rejects.
+    await agent.close();
+  }
+}
+main().catch((err) => { // top-level handler for any rejection from main()
+  console.error('embed example failed:', err.message);
+  process.exit(1); // non-zero exit status so callers can detect the failure
+});