npm - @semalt-ai/code - Versions diffs - 1.8.5 → 1.20.0 - Mend

@semalt-ai/code 1.8.5 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

package/.claude/settings.local.json +7 -1
package/.github/workflows/ci.yml +69 -0
package/ARCHITECTURE.md +6 -95
package/CLAUDE.md +196 -316
package/README.md +148 -4
package/docs/ARCHITECTURE.md +1321 -0
package/docs/CONFIG.md +340 -0
package/docs/HISTORY.md +245 -0
package/examples/embed.js +74 -0
package/index.js +251 -10
package/lib/agent.js +856 -120
package/lib/api.js +239 -50
package/lib/args.js +74 -2
package/lib/audit.js +23 -1
package/lib/background.js +584 -0
package/lib/checkpoints.js +757 -0
package/lib/commands/auth.js +94 -0
package/lib/commands/chat-session.js +489 -0
package/lib/commands/chat-slash.js +415 -0
package/lib/commands/chat-turn.js +669 -0
package/lib/commands/chat.js +407 -0
package/lib/commands/custom.js +157 -0
package/lib/commands/history-utils.js +66 -0
package/lib/commands/index.js +268 -0
package/lib/commands/mcp.js +113 -0
package/lib/commands/oneshot.js +193 -0
package/lib/commands/registry.js +269 -0
package/lib/commands/tasks.js +89 -0
package/lib/compact.js +87 -0
package/lib/config.js +360 -11
package/lib/constants.js +401 -3
package/lib/deny.js +199 -0
package/lib/doctor.js +160 -0
package/lib/headless.js +202 -0
package/lib/hooks.js +286 -0
package/lib/images.js +270 -0
package/lib/internals.js +49 -0
package/lib/mcp/boundary.js +131 -0
package/lib/mcp/client.js +270 -0
package/lib/mcp/oauth.js +134 -0
package/lib/memory.js +209 -0
package/lib/metrics.js +37 -2
package/lib/payload.js +54 -0
package/lib/permission-rules.js +401 -0
package/lib/permissions.js +123 -26
package/lib/pricing.js +67 -0
package/lib/proc.js +62 -0
package/lib/prompts.js +99 -8
package/lib/sandbox.js +568 -0
package/lib/sdk.js +328 -0
package/lib/secrets.js +211 -0
package/lib/skills.js +223 -0
package/lib/subagents.js +516 -0
package/lib/tool_registry.js +2862 -0
package/lib/tool_specs.js +263 -9
package/lib/tools.js +352 -1039
package/lib/ui/anim.js +86 -0
package/lib/ui/ansi.js +17 -27
package/lib/ui/chat-history.js +253 -71
package/lib/ui/create-ui.js +67 -24
package/lib/ui/diff.js +90 -25
package/lib/ui/file-activity.js +236 -0
package/lib/ui/format.js +195 -29
package/lib/ui/input-field.js +21 -11
package/lib/ui/md-stream.js +234 -0
package/lib/ui/render-operation.js +113 -0
package/lib/ui/select.js +1 -4
package/lib/ui/status-bar.js +146 -36
package/lib/ui/stream.js +20 -13
package/lib/ui/theme.js +190 -44
package/lib/ui/tool-operation.js +190 -0
package/lib/ui/utils.js +9 -5
package/lib/ui/web-activity.js +270 -0
package/lib/ui/writer.js +159 -45
package/lib/ui.js +1 -1
package/lib/verify.js +229 -0
package/lib/web-extract.js +213 -0
package/lib/web-summarize.js +68 -0
package/package.json +19 -4
package/scripts/lint.js +57 -0
package/test/agent-loop.test.js +389 -0
package/test/anim-driver.test.js +153 -0
package/test/ask-user-display.test.js +226 -0
package/test/ask-user-gate.test.js +231 -0
package/test/background.test.js +414 -0
package/test/chat-history-nocolor.test.js +155 -0
package/test/chat-relogin.test.js +207 -0
package/test/chat.test.js +114 -0
package/test/checkpoints-agent.test.js +181 -0
package/test/checkpoints.test.js +650 -0
package/test/command-registry.test.js +160 -0
package/test/compact.test.js +116 -0
package/test/completion-lazy.test.js +52 -0
package/test/config-merge.test.js +324 -0
package/test/config-quarantine.test.js +128 -0
package/test/config-write-guard-allow-anywhere.test.js +56 -0
package/test/config-write-guard-skip.test.js +46 -0
package/test/config-write-guard.test.js +153 -0
package/test/context-split.test.js +215 -0
package/test/cost-doctor.test.js +142 -0
package/test/custom-commands-chat.test.js +106 -0
package/test/custom-commands.test.js +230 -0
package/test/defer-detail-band.test.js +403 -0
package/test/deny-windows.test.js +120 -0
package/test/deny.test.js +83 -0
package/test/detail-band-tab-flatten.test.js +242 -0
package/test/download-allow-anywhere.test.js +66 -0
package/test/download-confine.test.js +153 -0
package/test/exec-diff.test.js +268 -0
package/test/executors.test.js +599 -0
package/test/extract-tool-calls.test.js +349 -0
package/test/fetch-url-validation.test.js +219 -0
package/test/file-activity.test.js +522 -0
package/test/fixtures/tool-calls.js +57 -0
package/test/fixtures/web-page.js +91 -0
package/test/git-tools.test.js +384 -0
package/test/grep-glob-serialize.test.js +242 -0
package/test/grep-glob.test.js +268 -0
package/test/grep-path-target.test.js +227 -0
package/test/harness/README.md +57 -0
package/test/harness/chat-harness.js +143 -0
package/test/harness/memwarn-headless-child.js +65 -0
package/test/harness/mock-llm.js +120 -0
package/test/harness/mock-mcp-server.js +142 -0
package/test/harness/sse-server.js +69 -0
package/test/headless.test.js +348 -0
package/test/history-utils.test.js +88 -0
package/test/hooks-agent.test.js +238 -0
package/test/hooks-verify-sandbox.test.js +232 -0
package/test/hooks.test.js +216 -0
package/test/http-get-user-agent.test.js +142 -0
package/test/images-api.test.js +208 -0
package/test/images.test.js +238 -0
package/test/input-field-ctrl-o.test.js +37 -0
package/test/live-height-physical.test.js +281 -0
package/test/max-iterations.test.js +218 -0
package/test/mcp-boundary.test.js +57 -0
package/test/mcp-client.test.js +267 -0
package/test/mcp-oauth.test.js +86 -0
package/test/md-stream.test.js +183 -0
package/test/memory-truncation-warning.test.js +222 -0
package/test/memory.test.js +198 -0
package/test/native-dispatch.test.js +409 -0
package/test/native-live-narration.test.js +254 -0
package/test/output-chokepoint.test.js +188 -0
package/test/output-heredoc-leak.test.js +195 -0
package/test/output-preview.test.js +245 -0
package/test/path-guards.test.js +134 -0
package/test/payload.test.js +99 -0
package/test/permission-rules-agent.test.js +210 -0
package/test/permission-rules.test.js +297 -0
package/test/permissions.test.js +362 -0
package/test/plan-mode.test.js +167 -0
package/test/read-paginate.test.js +275 -0
package/test/readonly-tools.test.js +177 -0
package/test/render-operation.test.js +317 -0
package/test/replay-descriptor-xml.test.js +216 -0
package/test/replay-descriptor.test.js +189 -0
package/test/replay-web-aggregate.test.js +291 -0
package/test/replay-web-persist.test.js +241 -0
package/test/result-cap.test.js +233 -0
package/test/running-glyph-anim.test.js +111 -0
package/test/sandbox-agent.test.js +147 -0
package/test/sandbox-integration.test.js +216 -0
package/test/sandbox.test.js +408 -0
package/test/sdk.test.js +234 -0
package/test/shell-output-cap.test.js +181 -0
package/test/skills-chat.test.js +110 -0
package/test/skills.test.js +295 -0
package/test/smoke.test.js +68 -0
package/test/status-bar-driver.test.js +93 -0
package/test/status-bar-pause.test.js +164 -0
package/test/status-bar-resync.test.js +188 -0
package/test/stream-parser.test.js +171 -0
package/test/subagents-agent.test.js +178 -0
package/test/subagents.test.js +222 -0
package/test/theme-palette.test.js +166 -0
package/test/tool-registry.test.js +85 -0
package/test/trim-budget.test.js +101 -0
package/test/truncate-visible.test.js +78 -0
package/test/verify-agent.test.js +317 -0
package/test/verify.test.js +141 -0
package/test/view-image.test.js +199 -0
package/test/web-activity-ordering.test.js +203 -0
package/test/web-activity.test.js +207 -0
package/test/web-data-extraction-guidance.test.js +71 -0
package/test/web-extract.test.js +185 -0
package/test/web-fetch-agent.test.js +291 -0
package/test/web-fetch-mode.test.js +193 -0
package/test/web-search.test.js +380 -0
package/lib/commands.js +0 -1438
package/path +0 -1

package/test/replay-web-persist.test.js ADDED Viewed

@@ -0,0 +1,241 @@
+'use strict';
+// Output Refactor — Phase 6c-i: persist web-op cores, with ZERO visible change.
+//
+// Web tools (web_search/http_get) are intercepted in chat-turn.js BEFORE they
+// become a normal ToolOperation descriptor (they collapse into the live web
+// summary). Before 6c-i the interception returned `undefined`, so the agent
+// loop's `displayCore || null` push stored a `null` slot and the web op fell to
+// the legacy whole-blob / summarizeToolResult fallback on replay. 6c-i makes the
+// interception return a dedicated web-op core `{v:1,kind:'web',…}` that the slot
+// now carries on BOTH rails — while every replay reader is taught to treat a
+// web-core as "no descriptor → fallback". Net contract: replay output is
+// BYTE-IDENTICAL to today. Aggregation lands in 6c-ii, not here.
+//
+// These tests pin:
+//   1. POSITIVE — serializeWebOp sources fields from ctx; the core lands in the
+//      slot on the native {role:'tool'} message AND the XML `_display[]` array.
+//   2. 6c-ii FLIP (headline) — replay of a native web message and of an XML blob
+//      containing a web op now AGGREGATES the web-core(s) into the committed
+//      `✓ web · …` summary, byte-identical to the live committed line. (These were
+//      6c-i "no-op replay" cases; 6c-ii deliberately makes web activity visible on
+//      replay. Persistence/shape/inv.1/live-tracker pins below are unchanged.)
+//   3. INV.1 — `content` byte-identical; the web-core is never folded into content.
+//   4. ANTI-PING-PONG — fresh live web display unchanged; 6a/6b normal-tool parity
+//      and Phase 1 fresh-render still byte-identical.
+const { test } = require('node:test');
+const assert = require('node:assert');
+// Force a colour-capable env so byte comparisons are stable (node:test runs each
+// file in its own process — no leak to other suites). Mirrors the 6a/6b suites.
+process.stdout.isTTY = true;
+delete process.env.NO_COLOR;
+const { buildToolOperation, serializeOperation, descriptorFromStored } = require('../lib/ui/tool-operation');
+const { renderOperation } = require('../lib/ui/render-operation');
+const { ChatHistory } = require('../lib/ui/chat-history');
+const { createChatSession } = require('../lib/commands/chat-session');
+const {
+  serializeWebOp,
+  isWebCore,
+  aggregateWebOps,
+  webSummaryText,
+  createWebActivityTracker,
+} = require('../lib/ui/web-activity');
+const stripAnsi = (s) => String(s).replace(/\x1b\[[0-9;]*m/g, '');
+const CFG = { diff_max_lines: 50, shell_preview_lines: 5 };
+// A normal (non-web) fixture used to build a MIXED blob (normal + web). Under 6c-i such a blob
+// dropped to the whole-blob summary; since 6c-ii the gate passes: the normal slot renders per-slot and the web slot aggregates.
+const EDIT_DIFF = buildToolOperation({
+  id: 'tool-1', tag: 'edit_file', arg: 'lib/x.js', attrs: { path: 'lib/x.js' },
+  status: 'ok', durationMs: 12, diff: { before: 'a\nb\nc\n', after: 'a\nB\nc\n', path: 'lib/x.js' },
+});
+// Tool ctx objects exactly as onToolEnd receives them: { attrs, meta, error }.
+// web_search with a query + a 200/bytes meta; http_get error case with a url and
+// a transport error (no status/bytes).
+const SEARCH_CTX = { attrs: { query: 'коррупционные скандалы 2024' }, meta: { status_code: 200, bytes: 4096 }, error: null };
+const FETCH_ERR_CTX = { attrs: { url: 'https://example.com/blocked' }, meta: null, error: { message: 'Request timeout' } };
+// ── Faithful models of the agent.js per-rail persistence of onToolEnd's return.
+// Native: _nativeToolMessage attaches `_display` only when truthy (agent.js:352-355),
+// receiving the already-`|| null`-ed slot. XML: messages.push({…, _display:
+// displayCores.slice()}) with each slot = `displayCore || null` (agent.js:1959-1963).
+function nativeToolMsg(content, core) { // test model of the native rail: builds a {role:'tool'} message from `content` plus an optional display core
+  const slot = core || null;            // the agent loop's `displayCore || null` push
+  const msg = { role: 'tool', content }; // `content` is stored as given; the core never touches it
+  if (slot) msg._display = slot;        // _nativeToolMessage: attach only when truthy
+  return msg; // when `core` is falsy the `_display` key is absent (not set to null)
+}
+function xmlBlob(results, cores) { // test model of the XML rail: one {role:'user'} message wrapping all tool results, plus optional per-slot cores
+  const m = {
+    role: 'user',
+    content: `Tool execution results:\n\n${results.join('\n\n')}\n\nContinue with the task. If everything is done, summarize what was accomplished.`, // results joined by a blank line, between a fixed header and a fixed trailer
+  };
+  if (cores !== undefined) m._display = cores.map((c) => c || null); // `displayCore || null` per slot
+  return m; // when `cores` is omitted the message has no `_display` key at all
+}
+// Drive the REAL replay (chat-session.displayLoadedMessages) over one loaded
+// message and capture exactly what is committed to scrollback.
+function replay(loadedMessage, cfg) { // returns, as one string, everything passed to ChatHistory#_commit while replaying a single loaded message
+  const ch = new ChatHistory();
+  const out = [];
+  ch._commit = (t) => out.push(t); // replace _commit on this instance so each committed chunk is captured in `out` (NOTE(review): assumes _commit is the only commit path — confirm)
+  const session = createChatSession({ chatHistory: ch, getConfig: () => cfg || CFG }); // falls back to the module-level CFG when `cfg` is falsy
+  session.displayLoadedMessages([loadedMessage]); // the message is wrapped in a one-element array
+  return out.join(''); // chunks are concatenated with no separator
+}
+// ───────────────────────────────────────────────────────────────────────────
+// 1. POSITIVE — serializeWebOp sources fields from ctx, and the core lands in the
+//    slot on BOTH rails (guards web-op persistence on native + XML).
+// ───────────────────────────────────────────────────────────────────────────
+test('serializeWebOp: sources query/status/bytes (search) and url/error (fetch) from ctx, flat for aggregateWebOps (guards core shape)', () => {
+  const search = serializeWebOp(SEARCH_CTX, 'web_search', 300);
+  assert.deepStrictEqual(search, {
+    v: 1, kind: 'web', tag: 'web_search',
+    query: 'коррупционные скандалы 2024', url: undefined,
+    status: 200, bytes: 4096, error: undefined, durationMs: 300,
+  });
+  const fetch = serializeWebOp(FETCH_ERR_CTX, 'http_get', 50);
+  assert.deepStrictEqual(fetch, {
+    v: 1, kind: 'web', tag: 'http_get',
+    query: undefined, url: 'https://example.com/blocked',
+    status: undefined, bytes: undefined, error: 'Request timeout', durationMs: 50,
+  });
+  // The flat fields are exactly what aggregateWebOps reads (so 6c-ii can feed
+  // these cores directly): 1 search query, and the errored fetch shows as blocked.
+  const text = webSummaryText(aggregateWebOps([search, fetch]));
+  assert.match(text, /search "коррупционные/);
+  assert.match(text, /0 sources read/);
+  assert.match(text, /1 blocked/);
+  // Both are recognized as web-cores; a normal descriptor core is NOT.
+  assert.ok(isWebCore(search) && isWebCore(fetch), 'web-cores recognized');
+  assert.ok(!isWebCore(serializeOperation(EDIT_DIFF)), 'a normal descriptor core is not a web-core');
+});
+test('persistence: the web-op core lands in the slot on the native {role:tool} message AND the XML _display[] array (guards both rails)', () => {
+  const core = serializeWebOp(SEARCH_CTX, 'web_search', 300);
+  // Native: truthy core → attached verbatim as `_display`.
+  const native = nativeToolMsg('web search "…" (5 results)', core);
+  assert.deepStrictEqual(native._display, core, 'native {role:tool}._display carries the web-core');
+  // XML: the slot that used to be `null` now holds the web-core.
+  const blob = xmlBlob(['edited lib/x.js', 'fetched'], [serializeOperation(EDIT_DIFF), core]);
+  assert.deepStrictEqual(blob._display[1], core, 'XML _display[] slot carries the web-core (no longer null)');
+});
+// ───────────────────────────────────────────────────────────────────────────
+// 2. 6c-ii FLIP (headline visible change) — replay now AGGREGATES web-cores into
+//    the committed `✓ web · …` summary instead of the legacy fallback. These two
+//    were the 6c-i "no-op replay" cases; 6c-ii deliberately makes them visible.
+//    Full oracle-parity coverage (interleaving, cross-iteration, both rails) lives
+//    in test/replay-web-aggregate.test.js — these pin the persist-file's own rails.
+// ───────────────────────────────────────────────────────────────────────────
+// Drive the live tracker over a single op and return its committed summary line —
+// the byte-exact oracle a replayed web summary must reproduce.
+function liveOracle(tag, input, ctx, durationMs) { // returns the last line the tracker handed to writerModule.endActivity for one start/end/flush cycle
+  const frames = [];
+  const tracker = createWebActivityTracker({ writerModule: { // stub writer: start/update are no-ops, only endActivity records
+    startActivity: () => {}, updateActivity: () => {},
+    endActivity: (_id, line) => frames.push(line), // the line is stored exactly as received, so callers can compare bytes
+  } });
+  tracker.start(tag, input);
+  tracker.end(tag, 'done', durationMs, ctx); // the status argument is always the literal 'done' here
+  tracker.flush();
+  return frames[frames.length - 1]; // undefined if endActivity was never invoked
+}
+test('6c-ii: a native web {role:tool} message now replays as the aggregated web summary, byte-identical to live (flip of 6c-i invisibility)', () => {
+  const core = serializeWebOp(FETCH_ERR_CTX, 'http_get', 120);
+  const oracle = liveOracle('http_get', 'https://example.com/blocked', FETCH_ERR_CTX, 120);
+  const committed = replay(nativeToolMsg('web · GET https://example.com\n<page body…>', core));
+  assert.strictEqual(committed, oracle, 'native web message replays as the aggregated summary, byte-identical to the live committed line');
+  // Not vacuous, and it IS the aggregated web summary (not the legacy fallback).
+  assert.match(stripAnsi(committed), /web/);
+  assert.match(stripAnsi(committed), /1 blocked/);
+});
+test('6c-ii: an XML blob mixing a normal slot + a web-op slot now renders the normal op per-slot AND aggregates the web op below it (gate passes)', () => {
+  const core = serializeWebOp(SEARCH_CTX, 'web_search', 300);
+  const results = ['edited lib/x.js', 'web search results…'];
+  // The normal slot alone replays per-slot (6b path) — same render the mixed blob
+  // must reproduce for its non-web slot.
+  const editOnly = replay(xmlBlob(results, [serializeOperation(EDIT_DIFF)]));
+  const webOracle = liveOracle('web_search', SEARCH_CTX.attrs.query, SEARCH_CTX, 300);
+  const mixed = replay(xmlBlob(results, [serializeOperation(EDIT_DIFF), core]));
+  assert.strictEqual(mixed, editOnly + webOracle, 'normal slot renders per-slot; the web slot aggregates into the summary committed below it');
+  // The non-web slot IS now rendered individually (its diff body appears) — the
+  // 6c-i whole-blob fallback no longer applies once every slot is a valid core.
+  const diffBody = renderOperation(descriptorFromStored(serializeOperation(EDIT_DIFF)), { mode: 'ansi', phase: 'detail', maxLines: 50 });
+  assert.ok(diffBody.length > 0 && mixed.includes(diffBody), 'the non-web slot is rendered per-slot (no whole-blob fallback)');
+  // And the web op is no longer hidden: its summary is visible.
+  assert.match(stripAnsi(mixed), /web/);
+});
+// ───────────────────────────────────────────────────────────────────────────
+// 3. INV.1 — `content` is byte-identical; the web-core never enters `content`.
+// ───────────────────────────────────────────────────────────────────────────
+test('inv.1: attaching the web-core leaves the XML feedback content byte-identical and out of the model-facing string (guards the chokepoint)', () => {
+  const core = serializeWebOp(SEARCH_CTX, 'web_search', 300);
+  const results = ['edited lib/x.js', 'web search results…'];
+  const expectedContent = `Tool execution results:\n\n${results.join('\n\n')}\n\nContinue with the task. If everything is done, summarize what was accomplished.`;
+  const blob = xmlBlob(results, [serializeOperation(EDIT_DIFF), core]);
+  assert.strictEqual(blob.content, expectedContent, 'content equals the results.join-wrapped string, byte-for-byte');
+  assert.ok(!blob.content.includes('kind'), 'no web-core framing leaked into content');
+  assert.ok(!blob.content.includes('коррупционные'), 'the query lives only in the core, not content');
+  // Native rail: the web-core is a sibling key, never inside `content`.
+  const native = nativeToolMsg('body', core);
+  assert.ok(!native.content.includes('kind') && !native.content.includes('web'), 'native content carries no core framing');
+});
+// ───────────────────────────────────────────────────────────────────────────
+// 4. ANTI-PING-PONG — live web display unchanged; 6a/6b + Phase 1 still identical.
+// ───────────────────────────────────────────────────────────────────────────
+test('anti-ping-pong: the live web tracker render is unchanged — nothing in the live path reads serializeWebOp\'s return (guards live region)', () => {
+  // A fake writer captures exactly what the live tracker renders.
+  const frames = [];
+  const writerModule = {
+    startActivity: (_id, fn) => frames.push(stripAnsi(fn(0))),
+    updateActivity: (_id, fn) => frames.push(stripAnsi(fn(0))),
+    endActivity: (_id, line) => frames.push(stripAnsi(line)),
+  };
+  const tracker = createWebActivityTracker({ writerModule });
+  tracker.start('web_search', 'коррупционные скандалы 2024');
+  tracker.end('web_search', 'ok', 300, SEARCH_CTX);
+  // The interception ALSO calls serializeWebOp — assert calling it does not alter
+  // what the tracker subsequently renders (the return value is persistence-only).
+  serializeWebOp(SEARCH_CTX, 'web_search', 300);
+  tracker.flush();
+  const committed = frames[frames.length - 1];
+  assert.match(committed, /web/);
+  assert.match(committed, /search/);
+  // The committed line is exactly the tracker's own summary — untouched by 6c-i.
+  assert.strictEqual(committed, stripAnsi(require('../lib/ui/web-activity').formatWebSummaryLine(aggregateWebOps([
+    { tag: 'web_search', query: 'коррупционные скандалы 2024', status: 200, bytes: 4096 },
+  ]), { pending: false })));
+});
+test('anti-ping-pong: native (6a) normal-tool round-trip and Phase 1 fresh-render bytes unchanged (guards no regression)', () => {
+  // 6a: a normal descriptor still round-trips byte-identical (the web guard does
+  // not touch the non-web path).
+  const restored = descriptorFromStored(serializeOperation(EDIT_DIFF));
+  assert.strictEqual(
+    renderOperation(restored, { mode: 'ansi' }),
+    renderOperation(EDIT_DIFF, { mode: 'ansi' }),
+    'native rail result line round-trips byte-identical',
+  );
+  // Phase 1: the canonical fresh shell line bytes are pinned (same oracle as 6b).
+  const shellOk = renderOperation(
+    buildToolOperation({ status: 'success', tag: 'shell', arg: 'npm install', attrs: { command: 'npm install' }, durationMs: 2300, meta: { exit_code: 0 } }),
+    { mode: 'ansi', phase: 'result' },
+  );
+  assert.strictEqual(
+    shellOk,
+    '  \x1b[38;5;40m✓\x1b[0m \x1b[38;5;214mshell\x1b[0m \x1b[2m·\x1b[0m \x1b[38;5;214mnpm install\x1b[0m \x1b[2m·\x1b[0m \x1b[38;5;244m2.3s\x1b[0m \x1b[2m·\x1b[0m \x1b[38;5;244mexit 0\x1b[0m',
+  );
+});

package/test/result-cap.test.js ADDED Viewed

@@ -0,0 +1,233 @@
+'use strict';
+// Task W.8 — Cap MCP & subagent output entering context.
+//
+// THE CHANGE these tests pin: MCP tool results (lib/mcp/client.js) and subagent
+// final text (lib/subagents.js) were the last two UNBOUNDED paths into context —
+// both fenced as untrusted, but neither token-capped. A server (MCP) or a verbose
+// child (subagent) could blow context wholesale. Both serializers now apply the
+// standard capToTokens (consistent with W.5–W.7) BEFORE wrapping the text in the
+// untrusted fence, with DIFFERENT budgets:
+//   * MCP — STRICTER (third-party, untrusted, server-controlled): the riskiest.
+//   * Subagent — GENEROUS (our own child's synthesized result): a safety net.
+// Tests assert the MODEL-FACING (and parent-facing) result: the bound, the
+// truncation notice, the fence-still-present, and that the two budgets differ.
+const { test, before, after, afterEach } = require('node:test');
+const assert = require('node:assert');
+const ui = require('../lib/ui');
+const { createApiClient } = require('../lib/api');
+const { createToolExecutor, extractToolCalls } = require('../lib/tools');
+const { createPermissionManager } = require('../lib/permissions');
+const {
+  createAgentRunner, formatMcpResult, formatSubagentResult,
+} = require('../lib/agent');
+const toolRegistry = require('../lib/tool_registry');
+const { createSubagentManager, buildSpawnAgentEntry } = require('../lib/subagents');
+const {
+  DEFAULT_MCP_MAX_RESULT_TOKENS, DEFAULT_SUBAGENT_MAX_RESULT_TOKENS,
+} = require('../lib/constants');
+const { startMockLLM } = require('./harness/mock-llm');
+const FENCE_OPEN = /<<<UNTRUSTED_EXTERNAL_CONTENT/;
+const FENCE_CLOSE = /<<<END_UNTRUSTED_EXTERNAL_CONTENT>>>/;
+// ---------------------------------------------------------------------------
+// Part A — pure model-facing serializers (formatMcpResult / formatSubagentResult)
+// ---------------------------------------------------------------------------
+test('MCP: small result passes through fully, no notice, still fenced', () => {
+  const content = 'just a small payload from the server';
+  const out = formatMcpResult({ action: 'mcp__srv__tool', content, maxTokens: 10000 });
+  assert.match(out, /MCP tool mcp__srv__tool result:/);
+  assert.match(out, FENCE_OPEN);
+  assert.match(out, FENCE_CLOSE);
+  assert.ok(out.includes(content), 'full payload present');
+  assert.doesNotMatch(out, /capped at/);
+});
+test('MCP: large result is capped with a notice, INSIDE the untrusted fence', () => {
+  const content = 'x'.repeat(4000); // ~1000 tokens
+  const out = formatMcpResult({ action: 'mcp__srv__tool', content, maxTokens: 50 });
+  assert.match(out, /capped at ~50 tokens \(was ~\d+\)/, 'truncation notice present');
+  // The capped content (and its notice) must remain BETWEEN the fence delimiters.
+  const open = out.indexOf('<<<UNTRUSTED_EXTERNAL_CONTENT');
+  const close = out.indexOf('<<<END_UNTRUSTED_EXTERNAL_CONTENT>>>');
+  const noticeAt = out.indexOf('capped at');
+  assert.ok(open >= 0 && close > open, 'fence present and well-ordered');
+  assert.ok(noticeAt > open && noticeAt < close, 'notice sits inside the fence');
+  // The full payload did NOT enter context.
+  assert.ok(out.length < content.length, 'result is shorter than the raw payload');
+});
+test('MCP: isError surfaces the error note, still fenced', () => {
+  const out = formatMcpResult({ action: 'mcp__srv__t', content: 'boom', isError: true, maxTokens: 10000 });
+  assert.match(out, /\(the tool reported an error\)/);
+  assert.match(out, FENCE_OPEN);
+});
+test('subagent: short result passes through fully, no notice, fenced', () => {
+  const content = 'CHILD FINDINGS: the project is a CLI';
+  const out = formatSubagentResult({ count: 1, content, maxTokens: 20000 });
+  assert.match(out, /Result from 1 subagent/);
+  assert.match(out, FENCE_OPEN);
+  assert.match(out, FENCE_CLOSE);
+  assert.ok(out.includes(content));
+  assert.doesNotMatch(out, /capped at/);
+});
+test('subagent: long result is capped with a notice', () => {
+  const content = 'y'.repeat(4000);
+  const out = formatSubagentResult({ count: 1, content, maxTokens: 50 });
+  assert.match(out, /capped at ~50 tokens \(was ~\d+\)/);
+  assert.match(out, FENCE_OPEN);
+  assert.match(out, FENCE_CLOSE);
+  assert.ok(out.length < content.length);
+});
+test('subagent: plural label for multiple subagents', () => {
+  const out = formatSubagentResult({ count: 3, content: 'a', maxTokens: 20000 });
+  assert.match(out, /Result from 3 subagents/);
+});
+// ---------------------------------------------------------------------------
+// Part B — the two budgets are DISTINCT and MCP is STRICTER
+// ---------------------------------------------------------------------------
+test('default budgets: MCP is strictly stricter than subagent', () => {
+  assert.ok(DEFAULT_MCP_MAX_RESULT_TOKENS < DEFAULT_SUBAGENT_MAX_RESULT_TOKENS,
+    'MCP budget must be stricter than the subagent budget');
+});
+test('budgets differ: content between the two budgets is capped under MCP but passes under subagent', () => {
+  // Size the content so its estimate is ABOVE the MCP default and BELOW the
+  // subagent default (estimate ≈ chars/4). Midpoint of the two budgets.
+  const midTokens = Math.floor((DEFAULT_MCP_MAX_RESULT_TOKENS + DEFAULT_SUBAGENT_MAX_RESULT_TOKENS) / 2);
+  const content = 'z'.repeat(midTokens * 4);
+  // No explicit maxTokens → each serializer uses ITS OWN default budget.
+  const mcp = formatMcpResult({ action: 'mcp__s__t', content });
+  const sub = formatSubagentResult({ count: 1, content });
+  assert.match(mcp, /capped at/, 'MCP caps a payload above its stricter budget');
+  assert.doesNotMatch(sub, /capped at/, 'subagent passes the same payload under its generous budget');
+});
+// ---------------------------------------------------------------------------
+// Part C — through the REAL agent loop (the wiring reads config; fence intact)
+// ---------------------------------------------------------------------------
+let prevKey;
+before(() => { prevKey = process.env.SEMALT_API_KEY; process.env.SEMALT_API_KEY = 'test-key'; });
+after(() => {
+  if (prevKey === undefined) delete process.env.SEMALT_API_KEY;
+  else process.env.SEMALT_API_KEY = prevKey;
+});
+afterEach(() => { toolRegistry.clearDynamicTools(); });
+// Build a full parent stack (api + permissions + executors + agent runner). With
+// `withSubagent` it also wires a subagent manager from the SAME building blocks
+// and registers the spawn_agent tool — mirroring test/subagents-agent.test.js.
+function buildStack(base, config, { withSubagent = false } = {}) { // builds the parent stack against API base `base`; returns { runner, getConfig, cfg }
+  const cfg = {
+    api_base: base, api_key: 'test-key', default_model: 'test-model',
+    temperature: 0.5, request_timeout_ms: 5000, stream: true, models: [],
+    ...config, // spread last, so caller-supplied keys override the defaults above
+  };
+  const getConfig = () => cfg; // always hands back the same mutable object
+  const api = createApiClient({ getConfig, saveConfig: (c) => Object.assign(cfg, c), ui }); // saveConfig merges into `cfg` in place; nothing is persisted by this callback
+  const pm = createPermissionManager(ui, { skipPermissions: true });
+  pm.setUICallbacks({ onAddMessage: () => {}, onShowModal: () => {}, onCloseModal: () => {}, onCaptureNavigation: () => () => {} }); // all UI callbacks are no-ops; onCaptureNavigation returns a no-op function
+  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);
+  const runner = createAgentRunner({
+    chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
+    describePermission, permissionManager: pm, ui, getConfig,
+  });
+  if (withSubagent) { // opt-in: build a subagent manager from the same dependencies and register its spawn tool
+    const manager = createSubagentManager({
+      chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
+      describePermission, permissionManager: pm, ui, getConfig,
+    });
+    toolRegistry.registerDynamicTool(buildSpawnAgentEntry(manager)); // mutates the shared registry; the afterEach hook in this file clears dynamic tools
+  }
+  return { runner, getConfig, cfg };
+}
+// Register a fake MCP-style dynamic tool returning a fixed payload, so we exercise
+// the formatFileResult MCP branch WITHOUT the real SDK / a live server.
+function registerFakeMcpTool(content) { // registers a dynamic tool named 'mcp__test__big' whose execute() resolves with the given `content`
+  toolRegistry.registerDynamicTool({
+    tool: 'mcp__test__big',
+    mcp: true,
+    server: 'test',
+    spec: { description: 'fake', parameters: { type: 'object', properties: {} } }, // declares no parameters
+    fromParams: (p) => ['mcp__test__big', p || {}], // falsy params become an empty object
+    parseXml: () => [], // always returns an empty array
+    permission: () => null, // always returns null (NOTE(review): presumably "no permission prompt needed" — confirm against the registry contract)
+    execute: async () => ({ mcp: true, content, isError: false }), // always resolves with the fixed payload and isError: false
+  });
+}
+test('real loop: a large MCP result is capped + still fenced in the tool message', async () => {
+  const mock = await startMockLLM();
+  registerFakeMcpTool('Q'.repeat(4000)); // ~1000 tokens
+  mock.replyWithToolCall('mcp__test__big', {});
+  mock.replyWith('done');
+  try {
+    const { runner } = buildStack(mock.base, { mcp: { servers: {}, max_result_tokens: 20 } });
+    const messages = [{ role: 'user', content: 'call the mcp tool' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });
+    const toolMsg = messages.find((m) => m.role === 'tool' && /mcp__test__big/.test(m.content || ''));
+    assert.ok(toolMsg, 'MCP result fed back');
+    assert.match(toolMsg.content, FENCE_OPEN, 'still fenced after capping');
+    assert.match(toolMsg.content, FENCE_CLOSE);
+    assert.match(toolMsg.content, /capped at ~20 tokens/, 'capped at the configured MCP budget');
+    assert.ok(toolMsg.content.length < 4000, 'the full payload did not enter context');
+  } finally {
+    await mock.close();
+  }
+});
+test('real loop: a small MCP result passes through fully (paired positive), still fenced', async () => {
+  const mock = await startMockLLM();
+  registerFakeMcpTool('tiny payload');
+  mock.replyWithToolCall('mcp__test__big', {});
+  mock.replyWith('done');
+  try {
+    const { runner } = buildStack(mock.base, { mcp: { servers: {}, max_result_tokens: 10000 } });
+    const messages = [{ role: 'user', content: 'call the mcp tool' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });
+    const toolMsg = messages.find((m) => m.role === 'tool' && /mcp__test__big/.test(m.content || ''));
+    assert.ok(toolMsg);
+    assert.match(toolMsg.content, FENCE_OPEN);
+    assert.ok(toolMsg.content.includes('tiny payload'));
+    assert.doesNotMatch(toolMsg.content, /capped at/);
+  } finally {
+    await mock.close();
+  }
+});
+test('real loop: a verbose subagent final text is capped + still fenced, isolation intact', async () => {
+  const mock = await startMockLLM();
+  const longChild = 'L'.repeat(4000); // ~1000 tokens
+  mock.replyWithToolCall('spawn_agent', { prompt: 'go research' }); // parent
+  mock.replyWith(longChild);                                        // child final
+  mock.replyWith('noted');                                         // parent final
+  try {
+    const { runner } = buildStack(mock.base,
+      { subagents: { max_concurrency: 3, max_result_tokens: 30 } }, { withSubagent: true });
+    const messages = [{ role: 'user', content: 'investigate' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: { onError: () => {} } });
+    const toolMsg = messages.find((m) => m.role === 'tool' && /UNTRUSTED_EXTERNAL_CONTENT/.test(m.content || ''));
+    assert.ok(toolMsg, 'subagent result fed back fenced');
+    assert.match(toolMsg.content, FENCE_OPEN);
+    assert.match(toolMsg.content, FENCE_CLOSE);
+    assert.match(toolMsg.content, /capped at ~30 tokens/, 'capped at the configured subagent budget');
+    // Isolation unchanged: the parent did not absorb the child's long assistant turn.
+    const absorbed = messages.some((m) => m.role === 'assistant' && m.content === longChild);
+    assert.ok(!absorbed, 'the child assistant turn never lands in the parent history');
+  } finally {
+    await mock.close();
+  }
+});

package/test/running-glyph-anim.test.js ADDED Viewed

@@ -0,0 +1,111 @@
+'use strict';
+// Animated running op row (Output Refactor — Phase 3, Part 2).
+//
+// Before Phase 3 a running tool's glyph was a static dot (●, colored after
+// Phase 2.5 but not moving) and the elapsed meter only advanced as a side
+// effect of the status-bar timers firing. Phase 3 animates the running glyph:
+// the `SPINNER_DEFS.tool` frames in the category-tinted pending colour, with the
+// frame derived from the elapsed duration so it advances every ~100 ms as the
+// single driver repaints the row with a fresh elapsedMs. The elapsed meter
+// rides the same repaint.
+//
+// These tests exercise the render path the writer's activity region invokes:
+// renderOperation(descriptor, { phase: 'pending' }) → formatToolLine.
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { SPINNER_DEFS } = require('../lib/ui/ansi');
+const { formatToolLine } = require('../lib/ui/format');
+const { renderOperation } = require('../lib/ui/render-operation');
+// Strip SGR sequences (ESC '[' digits/semicolons 'm') so we can inspect the visible glyph/elapsed text.
+function plain(s) { return s.replace(/\x1b\[[0-9;]*m/g, ''); } // returns a new string; `s` must be a string
+function firstGlyph(s) { return plain(s).trimStart()[0]; } // first UTF-16 code unit after SGR + leading whitespace are removed; undefined if nothing remains
+const TOOL_FRAMES = SPINNER_DEFS.tool.frames; // frame list the tests below index into and compare rendered glyphs against
+// ---------------------------------------------------------------------------
+// The running glyph cycles the tool spinner frames as elapsed advances.
+// ---------------------------------------------------------------------------
+test('running glyph cycles spinner frames across ticks (elapsed-derived)', () => {
+  const base = { status: 'pending', tag: 'shell', arg: 'ls', attrs: { command: 'ls' }, category: 'shell' }; // pending shell op; only durationMs varies below
+  // Frame is floor(elapsedMs / 100) % frames.length. Sample one full cycle.
+  for (let i = 0; i < TOOL_FRAMES.length; i++) {
+    const line = formatToolLine({ ...base, durationMs: i * 100 }); // i-th 100 ms step
+    assert.strictEqual(firstGlyph(line), TOOL_FRAMES[i], `frame ${i} glyph`);
+  }
+  // Wraps around (modulo frames.length).
+  const wrapped = formatToolLine({ ...base, durationMs: TOOL_FRAMES.length * 100 }); // exactly one full cycle after 0 ms
+  assert.strictEqual(firstGlyph(wrapped), TOOL_FRAMES[0], 'frame index wraps');
+});
+test('the glyph actually changes between consecutive ~100ms samples', () => {
+  const base = { status: 'pending', tag: 'shell', arg: 'ls', attrs: { command: 'ls' }, category: 'shell' };
+  const g0 = firstGlyph(formatToolLine({ ...base, durationMs: 0 })); // three samples, 100 ms apart
+  const g1 = firstGlyph(formatToolLine({ ...base, durationMs: 100 }));
+  const g2 = firstGlyph(formatToolLine({ ...base, durationMs: 200 }));
+  assert.notStrictEqual(g0, g1, 'glyph advances 0 → 100ms'); // only adjacent samples are compared; g0 vs g2 is not checked
+  assert.notStrictEqual(g1, g2, 'glyph advances 100 → 200ms');
+});
+// ---------------------------------------------------------------------------
+// The elapsed meter advances with the duration (the frozen-timer fix).
+// ---------------------------------------------------------------------------
+test('the running elapsed meter advances as durationMs grows', () => {
+  const base = { status: 'pending', tag: 'shell', arg: 'ls', attrs: { command: 'ls' }, category: 'shell' };
+  const at1 = plain(formatToolLine({ ...base, durationMs: 1200 })); // SGR stripped so the substring checks see visible text only
+  const at3 = plain(formatToolLine({ ...base, durationMs: 3400 }));
+  assert.ok(at1.includes('1.2s'), `elapsed shows 1.2s: ${at1}`); // the rendered line is embedded in the failure message
+  assert.ok(at3.includes('3.4s'), `elapsed shows 3.4s: ${at3}`);
+  // Pending lines trail the duration with an ellipsis.
+  assert.ok(at1.includes('1.2s…'), 'pending duration trails with …'); // '…' is the single U+2026 character, not three dots
+});
+// ---------------------------------------------------------------------------
+// Blocking tools (ask_user, rendered noDuration) keep the static dot — a
+// ticking spinner would falsely imply work is happening.
+// ---------------------------------------------------------------------------
+test('blocking (noDuration) pending tools keep the static dot, not a spinner', () => {
+  const line = formatToolLine({
+    status: 'pending', tag: 'ask_user', arg: 'Pick one', attrs: { question: 'Pick one' },
+    category: 'tool', noDuration: true, // no durationMs is supplied for this descriptor
+  });
+  assert.strictEqual(firstGlyph(line), '●', 'frozen blocking glyph stays the pending dot');
+  assert.ok(!TOOL_FRAMES.includes(firstGlyph(line)), 'not a spinner frame'); // in effect: '●' must not itself be one of the spinner frames
+});
+// ---------------------------------------------------------------------------
+// The descriptor path (renderOperation, pending phase) animates the same way.
+// ---------------------------------------------------------------------------
+test('renderOperation pending phase animates the glyph via durationMs', () => {
+  const mk = (durationMs) => renderOperation( // renders the same pending read_file descriptor at a given elapsed time
+    {
+      status: 'pending', tag: 'read_file', target: 'a.txt', attrs: { path: 'a.txt' },
+      category: 'file', durationMs,
+    },
+    { mode: 'ansi', phase: 'pending' },
+  );
+  const g0 = firstGlyph(mk(0));
+  const g1 = firstGlyph(mk(100)); // 100 ms later
+  assert.strictEqual(g0, TOOL_FRAMES[0], 'descriptor pending → spinner frame 0 at 0ms');
+  assert.notStrictEqual(g0, g1, 'descriptor pending glyph advances with elapsed');
+});
+// ---------------------------------------------------------------------------
+// Single physical row: the animated running line carries no newline (Phase 4
+// owns wrap-aware multi-row; this phase must not introduce multi-row content).
+// ---------------------------------------------------------------------------
+test('the animated running row is a single physical line (no newline)', () => {
+  const line = formatToolLine({
+    status: 'pending', tag: 'shell', arg: 'ls', attrs: { command: 'ls' },
+    category: 'shell', durationMs: 500, // one arbitrary mid-run sample
+  });
+  assert.ok(!line.includes('\n'), 'no newline in the running row'); // checked on the raw string, SGR codes included
+});