npm - @semalt-ai/code - Versions diffs - 1.8.5 → 1.20.0 - Mend

@semalt-ai/code 1.8.5 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

package/.claude/settings.local.json +7 -1
package/.github/workflows/ci.yml +69 -0
package/ARCHITECTURE.md +6 -95
package/CLAUDE.md +196 -316
package/README.md +148 -4
package/docs/ARCHITECTURE.md +1321 -0
package/docs/CONFIG.md +340 -0
package/docs/HISTORY.md +245 -0
package/examples/embed.js +74 -0
package/index.js +251 -10
package/lib/agent.js +856 -120
package/lib/api.js +239 -50
package/lib/args.js +74 -2
package/lib/audit.js +23 -1
package/lib/background.js +584 -0
package/lib/checkpoints.js +757 -0
package/lib/commands/auth.js +94 -0
package/lib/commands/chat-session.js +489 -0
package/lib/commands/chat-slash.js +415 -0
package/lib/commands/chat-turn.js +669 -0
package/lib/commands/chat.js +407 -0
package/lib/commands/custom.js +157 -0
package/lib/commands/history-utils.js +66 -0
package/lib/commands/index.js +268 -0
package/lib/commands/mcp.js +113 -0
package/lib/commands/oneshot.js +193 -0
package/lib/commands/registry.js +269 -0
package/lib/commands/tasks.js +89 -0
package/lib/compact.js +87 -0
package/lib/config.js +360 -11
package/lib/constants.js +401 -3
package/lib/deny.js +199 -0
package/lib/doctor.js +160 -0
package/lib/headless.js +202 -0
package/lib/hooks.js +286 -0
package/lib/images.js +270 -0
package/lib/internals.js +49 -0
package/lib/mcp/boundary.js +131 -0
package/lib/mcp/client.js +270 -0
package/lib/mcp/oauth.js +134 -0
package/lib/memory.js +209 -0
package/lib/metrics.js +37 -2
package/lib/payload.js +54 -0
package/lib/permission-rules.js +401 -0
package/lib/permissions.js +123 -26
package/lib/pricing.js +67 -0
package/lib/proc.js +62 -0
package/lib/prompts.js +99 -8
package/lib/sandbox.js +568 -0
package/lib/sdk.js +328 -0
package/lib/secrets.js +211 -0
package/lib/skills.js +223 -0
package/lib/subagents.js +516 -0
package/lib/tool_registry.js +2862 -0
package/lib/tool_specs.js +263 -9
package/lib/tools.js +352 -1039
package/lib/ui/anim.js +86 -0
package/lib/ui/ansi.js +17 -27
package/lib/ui/chat-history.js +253 -71
package/lib/ui/create-ui.js +67 -24
package/lib/ui/diff.js +90 -25
package/lib/ui/file-activity.js +236 -0
package/lib/ui/format.js +195 -29
package/lib/ui/input-field.js +21 -11
package/lib/ui/md-stream.js +234 -0
package/lib/ui/render-operation.js +113 -0
package/lib/ui/select.js +1 -4
package/lib/ui/status-bar.js +146 -36
package/lib/ui/stream.js +20 -13
package/lib/ui/theme.js +190 -44
package/lib/ui/tool-operation.js +190 -0
package/lib/ui/utils.js +9 -5
package/lib/ui/web-activity.js +270 -0
package/lib/ui/writer.js +159 -45
package/lib/ui.js +1 -1
package/lib/verify.js +229 -0
package/lib/web-extract.js +213 -0
package/lib/web-summarize.js +68 -0
package/package.json +19 -4
package/scripts/lint.js +57 -0
package/test/agent-loop.test.js +389 -0
package/test/anim-driver.test.js +153 -0
package/test/ask-user-display.test.js +226 -0
package/test/ask-user-gate.test.js +231 -0
package/test/background.test.js +414 -0
package/test/chat-history-nocolor.test.js +155 -0
package/test/chat-relogin.test.js +207 -0
package/test/chat.test.js +114 -0
package/test/checkpoints-agent.test.js +181 -0
package/test/checkpoints.test.js +650 -0
package/test/command-registry.test.js +160 -0
package/test/compact.test.js +116 -0
package/test/completion-lazy.test.js +52 -0
package/test/config-merge.test.js +324 -0
package/test/config-quarantine.test.js +128 -0
package/test/config-write-guard-allow-anywhere.test.js +56 -0
package/test/config-write-guard-skip.test.js +46 -0
package/test/config-write-guard.test.js +153 -0
package/test/context-split.test.js +215 -0
package/test/cost-doctor.test.js +142 -0
package/test/custom-commands-chat.test.js +106 -0
package/test/custom-commands.test.js +230 -0
package/test/defer-detail-band.test.js +403 -0
package/test/deny-windows.test.js +120 -0
package/test/deny.test.js +83 -0
package/test/detail-band-tab-flatten.test.js +242 -0
package/test/download-allow-anywhere.test.js +66 -0
package/test/download-confine.test.js +153 -0
package/test/exec-diff.test.js +268 -0
package/test/executors.test.js +599 -0
package/test/extract-tool-calls.test.js +349 -0
package/test/fetch-url-validation.test.js +219 -0
package/test/file-activity.test.js +522 -0
package/test/fixtures/tool-calls.js +57 -0
package/test/fixtures/web-page.js +91 -0
package/test/git-tools.test.js +384 -0
package/test/grep-glob-serialize.test.js +242 -0
package/test/grep-glob.test.js +268 -0
package/test/grep-path-target.test.js +227 -0
package/test/harness/README.md +57 -0
package/test/harness/chat-harness.js +143 -0
package/test/harness/memwarn-headless-child.js +65 -0
package/test/harness/mock-llm.js +120 -0
package/test/harness/mock-mcp-server.js +142 -0
package/test/harness/sse-server.js +69 -0
package/test/headless.test.js +348 -0
package/test/history-utils.test.js +88 -0
package/test/hooks-agent.test.js +238 -0
package/test/hooks-verify-sandbox.test.js +232 -0
package/test/hooks.test.js +216 -0
package/test/http-get-user-agent.test.js +142 -0
package/test/images-api.test.js +208 -0
package/test/images.test.js +238 -0
package/test/input-field-ctrl-o.test.js +37 -0
package/test/live-height-physical.test.js +281 -0
package/test/max-iterations.test.js +218 -0
package/test/mcp-boundary.test.js +57 -0
package/test/mcp-client.test.js +267 -0
package/test/mcp-oauth.test.js +86 -0
package/test/md-stream.test.js +183 -0
package/test/memory-truncation-warning.test.js +222 -0
package/test/memory.test.js +198 -0
package/test/native-dispatch.test.js +409 -0
package/test/native-live-narration.test.js +254 -0
package/test/output-chokepoint.test.js +188 -0
package/test/output-heredoc-leak.test.js +195 -0
package/test/output-preview.test.js +245 -0
package/test/path-guards.test.js +134 -0
package/test/payload.test.js +99 -0
package/test/permission-rules-agent.test.js +210 -0
package/test/permission-rules.test.js +297 -0
package/test/permissions.test.js +362 -0
package/test/plan-mode.test.js +167 -0
package/test/read-paginate.test.js +275 -0
package/test/readonly-tools.test.js +177 -0
package/test/render-operation.test.js +317 -0
package/test/replay-descriptor-xml.test.js +216 -0
package/test/replay-descriptor.test.js +189 -0
package/test/replay-web-aggregate.test.js +291 -0
package/test/replay-web-persist.test.js +241 -0
package/test/result-cap.test.js +233 -0
package/test/running-glyph-anim.test.js +111 -0
package/test/sandbox-agent.test.js +147 -0
package/test/sandbox-integration.test.js +216 -0
package/test/sandbox.test.js +408 -0
package/test/sdk.test.js +234 -0
package/test/shell-output-cap.test.js +181 -0
package/test/skills-chat.test.js +110 -0
package/test/skills.test.js +295 -0
package/test/smoke.test.js +68 -0
package/test/status-bar-driver.test.js +93 -0
package/test/status-bar-pause.test.js +164 -0
package/test/status-bar-resync.test.js +188 -0
package/test/stream-parser.test.js +171 -0
package/test/subagents-agent.test.js +178 -0
package/test/subagents.test.js +222 -0
package/test/theme-palette.test.js +166 -0
package/test/tool-registry.test.js +85 -0
package/test/trim-budget.test.js +101 -0
package/test/truncate-visible.test.js +78 -0
package/test/verify-agent.test.js +317 -0
package/test/verify.test.js +141 -0
package/test/view-image.test.js +199 -0
package/test/web-activity-ordering.test.js +203 -0
package/test/web-activity.test.js +207 -0
package/test/web-data-extraction-guidance.test.js +71 -0
package/test/web-extract.test.js +185 -0
package/test/web-fetch-agent.test.js +291 -0
package/test/web-fetch-mode.test.js +193 -0
package/test/web-search.test.js +380 -0
package/lib/commands.js +0 -1438
package/path +0 -1

package/test/native-dispatch.test.js ADDED Viewed

@@ -0,0 +1,409 @@
+'use strict';
+// Native-path dispatch tests (Pre-Task 4.0c). Closes the coverage-shape blind
+// spot the re-audit found: end-to-end dispatch through the REAL runAgentLoop was
+// proven for read-only tools on BOTH rails (Task 3.3b), but EFFECTFUL tools were
+// only ever exercised through the XML path. The native function-calling path has
+// distinct glue — mapInvokeToCall → descriptor gate → role:'tool' result rooting
+// (lib/agent.js ~1378) — and Phase 4's per-pattern permissions + checkpoints both
+// hook the MUTATING dispatch path. Layering that onto unverified native glue would
+// repeat the 3.3b mistake, so these tests lock the native path end-to-end for:
+//   * file-mutating tools (write, edit, delete, move),
+//   * shell/exec,
+//   * plan-mode withhold + approve,
+// each asserting the mutation actually happens, the permission gate fires (with
+// the right descriptor), and the result is rooted as a role:'tool' message on the
+// originating tool_call_id. Where useful each native case is paired with its XML
+// equivalent IN THE SAME TEST so the two rails are proven equivalent at the loop
+// level for effectful tools (extending the 3.3b read-only equivalence proof).
+//
+// Driven via mock.replyWithToolCall(name, args) — a native tool_calls response
+// with EMPTY text content — against a temp $cwd so isPathSafe (CWD-confined)
+// permits the writes. skipPermissions auto-approves so the loop runs unattended;
+// pm.askPermission is wrapped to RECORD each gate consultation so we can assert
+// the descriptor fired for mutating tools and did NOT for read-only ones.
+const { test, before, after } = require('node:test');
+const assert = require('node:assert');
+const os = require('node:os');
+const fs = require('node:fs');
+const path = require('node:path');
+const ui = require('../lib/ui');
+const { createApiClient } = require('../lib/api');
+const { createToolExecutor, extractToolCalls } = require('../lib/tools');
+const { createPermissionManager } = require('../lib/permissions');
+const { createAgentRunner } = require('../lib/agent');
+const { startMockLLM } = require('./harness/mock-llm');
+let prevKey;
+let CWD;
+let PREV_CWD;
+before(() => {
+  prevKey = process.env.SEMALT_API_KEY; process.env.SEMALT_API_KEY = 'test-key';
+  PREV_CWD = process.cwd();
+  CWD = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-native-')));
+  process.chdir(CWD);
+});
+after(() => {
+  process.chdir(PREV_CWD);
+  try { fs.rmSync(CWD, { recursive: true, force: true }); } catch {}
+  if (prevKey === undefined) delete process.env.SEMALT_API_KEY; else process.env.SEMALT_API_KEY = prevKey;
+});
+// A real runner whose chatStream points at `base`. skipPermissions auto-approves
+// so the loop runs unattended; `asks` records every permission-gate consultation
+// (actionType + tag) so a test can prove the descriptor fired (mutating) or did
+// not (read-only). Returns `getSaved` for the rare persistence assertion.
+function buildRunner(base) {
+  const config = {
+    api_base: base, api_key: 'test-key', default_model: 'test-model',
+    temperature: 0.5, request_timeout_ms: 5000, stream: true, models: [],
+    // Dispatch test, not a sandbox test — run real `echo` unsandboxed regardless
+    // of the runner's bwrap/sandbox-exec availability (Task 4.4).
+    sandbox: { mode: 'off' },
+  };
+  let saved = null;
+  const getConfig = () => config;
+  const saveConfig = (c) => { saved = { ...c }; Object.assign(config, c); };
+  const api = createApiClient({ getConfig, saveConfig, ui });
+  const pm = createPermissionManager(ui, { skipPermissions: true });
+  pm.setUICallbacks({ onAddMessage: () => {}, onShowModal: () => {}, onCloseModal: () => {}, onCaptureNavigation: () => () => {} });
+  const asks = [];
+  const realAsk = pm.askPermission;
+  pm.askPermission = async (actionType, description, tag) => {
+    asks.push({ actionType, tag });
+    return realAsk(actionType, description, tag);
+  };
+  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);
+  const runner = createAgentRunner({
+    chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
+    describePermission, permissionManager: pm, ui, getConfig,
+  });
+  return { runner, asks, getSaved: () => saved };
+}
+function collector(extra = {}) {
+  const ev = { tokens: [], tools: [], errors: [], retries: [], assistants: [], withheld: [] };
+  const cb = {
+    onToken: (t) => ev.tokens.push(t),
+    onToolStart: () => {},
+    onToolEnd: (tag, result) => ev.tools.push({ tag, result }),
+    onError: (e) => ev.errors.push(e),
+    onRetry: (next, max) => ev.retries.push({ next, max }),
+    onAssistantMessage: (m) => ev.assistants.push(m),
+    onPlanWithhold: (tag, arg, desc) => ev.withheld.push({ tag, arg, desc }),
+    ...extra,
+  };
+  return { ev, cb };
+}
+// The assistant turn that carried structured tool_calls (native shape), plus its
+// rooted role:'tool' result. Shared assertion: a native tool turn records an
+// assistant message with EMPTY text + tool_calls, and the result comes back on a
+// role:'tool' message keyed to the originating tool_call id (lib/agent.js ~1378).
+function assertNativeRooting(messages, fnName) {
+  const assistantWithCall = messages.find((m) => m.role === 'assistant' && Array.isArray(m.tool_calls));
+  assert.ok(assistantWithCall, 'assistant message recorded the native tool_calls');
+  assert.strictEqual(assistantWithCall.content, '', 'native tool-call turn has empty text content');
+  assert.strictEqual(assistantWithCall.tool_calls[0].function.name, fnName, `tool_calls names ${fnName}`);
+  const toolMsg = messages.find((m) => m.role === 'tool');
+  assert.ok(toolMsg, 'native path appends a role:"tool" result message');
+  assert.strictEqual(toolMsg.tool_call_id, assistantWithCall.tool_calls[0].id, 'result rooted to its tool_call id');
+  assert.ok(!messages.some((m) => m.role === 'user' && /Tool execution results/.test(m.content)),
+    'native path does NOT use the XML "Tool execution results" user message');
+  return toolMsg;
+}
+// ---------------------------------------------------------------------------
+// 1. Native file-mutating: write — mutation + gate + role:'tool' rooting,
+//    paired with the XML equivalent for loop-level equivalence.
+// ---------------------------------------------------------------------------
+test('native write_file: mutates, gate fires (file/write_file), result rooted as role:"tool"', async () => {
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('write_file', { path: 'native-write.txt', content: 'NATIVE_WRITE_CONTENT' });
+  mock.replyWith('Wrote it.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'write the file' }];
+    const { metrics } = await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    // The mutation actually happened.
+    assert.strictEqual(fs.readFileSync(path.join(CWD, 'native-write.txt'), 'utf8'), 'NATIVE_WRITE_CONTENT');
+    // The permission gate fired with the mutating descriptor (NOT auto-skipped
+    // as a read-only tool).
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'write_file' }], 'write gate consulted once');
+    // The tool dispatched and the result is rooted on the tool_call id.
+    assert.strictEqual(ev.tools.length, 1);
+    assert.strictEqual(ev.tools[0].tag, 'write');
+    const toolMsg = assertNativeRooting(messages, 'write_file');
+    assert.match(toolMsg.content, /Wrote \d+ bytes to native-write\.txt/);
+    assert.strictEqual(metrics.turns.length, 2, 'tool turn + final turn');
+    assert.ok(messages.some((m) => m.role === 'assistant' && m.content === 'Wrote it.'));
+    assert.strictEqual(mock.pending(), 0);
+  } finally {
+    await mock.close();
+  }
+});
+test('XML write_file equivalent: same tool, same mutation, XML "Tool execution results" shape', async () => {
+  const mock = await startMockLLM();
+  mock.replyWith('<write_file path="xml-write.txt">XML_WRITE_CONTENT</write_file>');
+  mock.replyWith('Wrote it.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'write the file' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.strictEqual(fs.readFileSync(path.join(CWD, 'xml-write.txt'), 'utf8'), 'XML_WRITE_CONTENT');
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'write_file' }], 'same gate fires on the XML rail');
+    assert.strictEqual(ev.tools[0].tag, 'write');
+    // XML results come back as a role:'user' message, never role:'tool'.
+    const toolResult = messages.find((m) => m.role === 'user' && /Tool execution results/.test(m.content));
+    assert.ok(toolResult && /Wrote \d+ bytes to xml-write\.txt/.test(toolResult.content));
+    assert.ok(!messages.some((m) => m.role === 'tool'), 'XML path does not use role:"tool" messages');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 2. Native file-mutating: edit_file (line replacement) — multi-arg native call.
+// ---------------------------------------------------------------------------
+test('native edit_file: replaces the target line, gate fires, result rooted', async () => {
+  fs.writeFileSync(path.join(CWD, 'native-edit.txt'), 'line1\nline2\nline3\n');
+  const mock = await startMockLLM();
+  // fromParams: { path, line, content } → ['edit_file', path, parseInt(line), content]
+  mock.replyWithToolCall('edit_file', { path: 'native-edit.txt', line: 2, content: 'EDITED_LINE_2' });
+  mock.replyWith('Edited.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'edit line 2' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.strictEqual(fs.readFileSync(path.join(CWD, 'native-edit.txt'), 'utf8'), 'line1\nEDITED_LINE_2\nline3\n');
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'edit_file' }]);
+    assert.strictEqual(ev.tools[0].tag, 'edit_file');
+    assertNativeRooting(messages, 'edit_file');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 3. Native file-mutating: delete_file.
+// ---------------------------------------------------------------------------
+test('native delete_file: removes the file, gate fires, result rooted', async () => {
+  const target = path.join(CWD, 'native-delete.txt');
+  fs.writeFileSync(target, 'doomed');
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('delete_file', { path: 'native-delete.txt' });
+  mock.replyWith('Deleted.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'delete it' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.ok(!fs.existsSync(target), 'the file was deleted');
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'delete_file' }]);
+    assert.strictEqual(ev.tools[0].tag, 'delete_file');
+    const toolMsg = assertNativeRooting(messages, 'delete_file');
+    assert.match(toolMsg.content, /Deleted native-delete\.txt/);
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 4. Native file-mutating: move_file (multi-arg src/dst).
+// ---------------------------------------------------------------------------
+test('native move_file: renames src→dst, gate fires, result rooted', async () => {
+  fs.writeFileSync(path.join(CWD, 'native-src.txt'), 'movable');
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('move_file', { src: 'native-src.txt', dst: 'native-dst.txt' });
+  mock.replyWith('Moved.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'move it' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.ok(!fs.existsSync(path.join(CWD, 'native-src.txt')), 'source gone');
+    assert.strictEqual(fs.readFileSync(path.join(CWD, 'native-dst.txt'), 'utf8'), 'movable', 'dst has the content');
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'move_file' }]);
+    assert.strictEqual(ev.tools[0].tag, 'move_file');
+    assertNativeRooting(messages, 'move_file');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 5. Native shell/exec — paired with the XML <exec> equivalent.
+// ---------------------------------------------------------------------------
+test('native shell: dispatches, gate fires (shell/exec), result rooted as role:"tool"', async () => {
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('shell', { command: 'echo NATIVE_SHELL_OUT' });
+  mock.replyWith('Ran it.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'run echo' }];
+    const { metrics } = await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.deepStrictEqual(asks, [{ actionType: 'shell', tag: 'exec' }], 'shell gate consulted with exec tag');
+    assert.strictEqual(ev.tools.length, 1);
+    assert.strictEqual(ev.tools[0].tag, 'shell');
+    const toolMsg = assertNativeRooting(messages, 'shell');
+    assert.match(toolMsg.content, /NATIVE_SHELL_OUT/, 'command stdout flowed back');
+    assert.match(toolMsg.content, /Exit code: 0/);
+    assert.strictEqual(metrics.turns.length, 2);
+    assert.ok(messages.some((m) => m.role === 'assistant' && m.content === 'Ran it.'));
+  } finally {
+    await mock.close();
+  }
+});
+test('XML shell equivalent: same dispatch, XML "Tool execution results" shape', async () => {
+  const mock = await startMockLLM();
+  mock.replyWith('<exec>echo XML_SHELL_OUT</exec>');
+  mock.replyWith('Ran it.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'run echo' }];
+    await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.deepStrictEqual(asks, [{ actionType: 'shell', tag: 'exec' }], 'same gate fires on the XML rail');
+    assert.strictEqual(ev.tools[0].tag, 'shell');
+    const toolResult = messages.find((m) => m.role === 'user' && /Tool execution results/.test(m.content));
+    assert.ok(toolResult && /XML_SHELL_OUT/.test(toolResult.content) && /Exit code: 0/.test(toolResult.content));
+    assert.ok(!messages.some((m) => m.role === 'tool'), 'XML path does not use role:"tool" messages');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 6. Native plan-mode: a mutating tool arriving via the native path is WITHHELD,
+//    and approval (plan mode off) lets it proceed — mirrors the XML plan test,
+//    additionally proving the withheld result is rooted as role:'tool' (native).
+// ---------------------------------------------------------------------------
+test('native plan mode: withholds the native mutating tool (no mutation), result rooted as role:"tool"', async () => {
+  const target = path.join(CWD, 'native-planned.txt');
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('write_file', { path: 'native-planned.txt', content: 'SHOULD_NOT_WRITE' });
+  mock.replyWith('Here is my plan.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'change the file' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb, planMode: true });
+    assert.ok(!fs.existsSync(target), 'the file was NOT written in plan mode');
+    assert.strictEqual(res.withheldActions.length, 1, 'one action withheld');
+    assert.strictEqual(res.withheldActions[0].tag, 'write');
+    assert.deepStrictEqual(ev.withheld.map((w) => w.tag), ['write'], 'onPlanWithhold fired for the native call');
+    // Plan-mode withholding happens BEFORE the permission gate — never consulted.
+    assert.deepStrictEqual(asks, [], 'no permission prompt for a withheld tool');
+    // The withheld notice is still rooted on the native tool_call id (the loop
+    // pushes role:'tool' for native calls — lib/agent.js ~1366), keeping the
+    // assistant tool_calls ↔ tool-result map consistent for the next turn.
+    const toolMsg = assertNativeRooting(messages, 'write_file');
+    assert.match(toolMsg.content, /\[plan mode\] Withheld pending approval/);
+    assert.ok(messages.some((m) => m.role === 'assistant' && m.content === 'Here is my plan.'), 'plan recorded');
+  } finally {
+    await mock.close();
+  }
+});
+test('native plan mode OFF (approval): the same native mutating tool executes', async () => {
+  const target = path.join(CWD, 'native-approved.txt');
+  const mock = await startMockLLM();
+  mock.replyWithToolCall('write_file', { path: 'native-approved.txt', content: 'APPROVED' });
+  mock.replyWith('Done.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'write it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb, planMode: false });
+    assert.strictEqual(fs.readFileSync(target, 'utf8'), 'APPROVED', 'the file was written after approval');
+    assert.strictEqual(res.withheldActions.length, 0, 'nothing withheld with plan mode off');
+    assert.deepStrictEqual(asks, [{ actionType: 'file', tag: 'write_file' }], 'gate fired on the executing path');
+    assert.strictEqual(ev.tools[0].tag, 'write');
+    assertNativeRooting(messages, 'write_file');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 7. P1 — native rail does NOT run textual command heuristics. A finish=stop
+//    turn whose prose contains an illustrative ```bash block (no tool tag) must
+//    yield ZERO tool calls (the incident: a hung `su nobody` + placeholder
+//    examples were executed). EXPLICIT tool tags still dispatch on the native
+//    rail. (test-model has no profile → isNativeToolsActive defaults true, the
+//    same native-rail assumption the structured tests above rely on.)
+// ---------------------------------------------------------------------------
+test('P1 native rail: illustrative ```bash block in a final answer is NOT executed (heuristic skipped)', async () => {
+  const mock = await startMockLLM();
+  // Single stop-turn: prose with a fenced example, no tool tag. On the native
+  // rail the bare-fence heuristic is suppressed → zero tool calls → final answer.
+  mock.replyWith('To drop privileges you could run:\n```bash\nsu nobody\necho "$SECRET_TOKEN"\n```\nBut do not run that here.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'how do I drop privileges?' }];
+    const { metrics } = await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.strictEqual(ev.tools.length, 0, 'NO shell command extracted/executed from the illustrative fence');
+    assert.deepStrictEqual(asks, [], 'no permission gate consulted — nothing dispatched');
+    assert.strictEqual(metrics.turns.length, 1, 'single turn — treated as a completed text answer');
+    assert.ok(messages.some((m) => m.role === 'assistant' && /su nobody/.test(m.content)),
+      'the prose (including the example) is recorded as the final answer, not run');
+    assert.strictEqual(mock.pending(), 0, 'loop ended on the first reply (no tool round-trip)');
+  } finally {
+    await mock.close();
+  }
+});
+test('P1 native rail: an EXPLICIT <shell> tag in content STILL dispatches and executes', async () => {
+  const mock = await startMockLLM();
+  mock.replyWith('<shell>echo NATIVE_XML_TAG_OUT</shell>');
+  mock.replyWith('Ran it.');
+  try {
+    const { runner, asks } = buildRunner(mock.base);
+    const { ev, cb } = collector();
+    const messages = [{ role: 'user', content: 'run echo' }];
+    const { metrics } = await runner.runAgentLoop(messages, 'test-model', 5, null, { callbacks: cb });
+    assert.strictEqual(ev.tools.length, 1, 'explicit tag dispatched on the native rail');
+    assert.strictEqual(ev.tools[0].tag, 'shell');
+    assert.deepStrictEqual(asks, [{ actionType: 'shell', tag: 'exec' }], 'shell gate fired for the explicit tag');
+    const toolResult = messages.find((m) => m.role === 'user' && /Tool execution results/.test(m.content));
+    assert.ok(toolResult && /NATIVE_XML_TAG_OUT/.test(toolResult.content), 'command stdout flowed back');
+    assert.strictEqual(metrics.turns.length, 2, 'tool turn + final turn');
+    assert.ok(messages.some((m) => m.role === 'assistant' && m.content === 'Ran it.'));
+  } finally {
+    await mock.close();
+  }
+});

package/test/native-live-narration.test.js ADDED Viewed

@@ -0,0 +1,254 @@
+'use strict';
+// Live token-by-token narration on the NATIVE tool-call rail, gated by a safety
+// signal so implicit reasoning is NEVER streamed (leaked).
+//
+// These tests drive the REAL createTurnHandler callbacks (chat-turn.js) wired to
+// the REAL ChatHistory (chat-history.js, with _commit captured), simulating the
+// per-iteration callback order agent.js + api.js produce:
+//
+//   onRequestSent()                      reset gate + safety signals
+//   onReasoningStart()                   (only when delta.reasoning_content seen)
+//   onStreamStart(nativeRail, inlineR)   first content token — rail + flag (signal b)
+//   onToken(t) …                         each delta.content token
+//   onAssistantMessage(clean,{terminal}) finalize the turn
+//
+// The gate (chat-turn.js onToken) opens eagerly ONLY when nativeRail AND
+// (reasoningSeen OR inline_reasoning:false). Otherwise it keeps the buffered-
+// until-boundary behavior (the no-leak fallback, also the entire XML-rail path).
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { createTurnHandler } = require('../lib/commands/chat-turn');
+const { ChatHistory } = require('../lib/ui/chat-history');
+const { stripAnsi } = require('../lib/ui/utils');
+const { normalizeConfig } = require('../lib/config');
+// ---------------------------------------------------------------------------
+// Config: inline_reasoning (signal b) normalizes to an explicit boolean only.
+// ---------------------------------------------------------------------------
+test('inline_reasoning persists only as an explicit boolean on a models[] profile', () => {
+  const base = { api_base: 'http://x', api_key: 'k', model: 'm' };
+  const norm = (extra) => normalizeConfig({ models: [{ ...base, ...extra }] }).models[0];
+  assert.strictEqual('inline_reasoning' in norm({}), false, 'unset by default → assume might inline (safe)');
+  assert.strictEqual(norm({ inline_reasoning: false }).inline_reasoning, false, 'explicit false persists');
+  assert.strictEqual(norm({ inline_reasoning: true }).inline_reasoning, true, 'explicit true persists');
+  // Non-boolean junk is dropped (stays unset → safe default), never coerced.
+  assert.strictEqual('inline_reasoning' in norm({ inline_reasoning: 'false' }), false, 'string is dropped, not coerced');
+  assert.strictEqual('inline_reasoning' in norm({ inline_reasoning: 0 }), false, 'number is dropped, not coerced');
+});
+// Build a harness around the REAL createTurnHandler + REAL ChatHistory.
+// `streamed`  — tokens the gate passed live to chatHistory.streamToken.
+// `committed` — everything that reached scrollback (via the captured _commit).
+// `statuses`  — statusBar.update (state,label) pairs, to assert the transition.
+function harness(opts = {}) {
+  const streamed = [];
+  const committed = [];
+  const statuses = [];
+  const chatHistory = new ChatHistory();
+  chatHistory._commit = (t) => committed.push(t);
+  chatHistory._setDetail = () => {};
+  chatHistory._commitDetail = (t) => { if (t) committed.push(t); };
+  const origStream = chatHistory.streamToken.bind(chatHistory);
+  chatHistory.streamToken = (t) => { streamed.push(t); origStream(t); };
+  const statusBar = {
+    update: (state, label) => statuses.push([state, label]),
+    onToken() {}, addPendingTokens() {}, updateMetrics() {}, setCost() {},
+  };
+  const inputField = {
+    on() {}, removeListener() {}, releaseNavigation() {}, setDisabled() {},
+  };
+  const writerModule = {
+    startActivity() {}, updateActivity() {}, endActivity() {}, scrollback() {},
+  };
+  let scenario = async () => {};
+  const runAgentLoop = async (messages, model, maxIter, limit, loopOpts) => {
+    await scenario(loopOpts.callbacks);
+    return { messages, metrics: { turns: [] }, withheldActions: [] };
+  };
+  const ctx = {
+    inputField, statusBar, chatHistory, writerModule, runAgentLoop,
+    getConfig: () => ({ auth_token: 'tok', max_iterations: 50, show_cost: false, system_prompt_mode: 'system_role' }),
+    approxTokens: () => 0,
+    resolveCommand: () => null,
+    opts: { showThink: !!opts.showThink },
+    TAG_REGISTRY: {},
+    collapseListMsg() {}, handlePendingSelection() {}, showPendingStep() {},
+    activateNavCapture() {}, finalizeListMsg() {},
+    createChatIfNeeded: async () => {}, saveTurnToDashboard: async () => {}, saveSession() {},
+    messages: [], currentModel: 'm', debugMode: false, pendingImages: [],
+    chatSync: async () => '', resolvedSystemPrompt: '', resolvedTokenLimit: null, planMode: false,
+  };
+  const handler = createTurnHandler(ctx, {});
+  return { streamed, committed, statuses, handler, setScenario: (fn) => { scenario = fn; } };
+}
+// Simulate one streaming iteration in the exact callback order agent.js drives.
+function iteration(cb, { native, inlineReasoning, reasoningSeen, tokens, finalContent }) {
+  cb.onRequestSent();
+  if (reasoningSeen) cb.onReasoningStart();      // api.js wrappedOnReasoning (signal a)
+  cb.onStreamStart(native, inlineReasoning);     // first content token (rail + signal b)
+  for (const t of tokens) cb.onToken(t);
+  cb.onAssistantMessage(finalContent, { terminal: true });
+}
+function committedText(committed) { return stripAnsi(committed.join('')); }
+function countHeaders(committed) {
+  return (committedText(committed).match(/▸ AI-agent/g) || []).length;
+}
+// ---------------------------------------------------------------------------
+// Case 1 — native rail + reasoning_content seen → narration streams LIVE
+// ---------------------------------------------------------------------------
+test('native rail + reasoning_content seen: narration streams live (gate opens before finalize)', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: true, inlineReasoning: undefined, reasoningSeen: true,
+      tokens: ['Hello', ' world'], finalContent: 'Hello world',
+    });
+  });
+  await h.handler('hi');
+  assert.deepStrictEqual(h.streamed, ['Hello', ' world'], 'both narration tokens streamed live');
+  // Status bar transitioned to streaming on eager-open (not stuck on Thinking).
+  assert.ok(h.statuses.some(([s, l]) => s === 'streaming' && l === 'Streaming response'),
+    'status bar transitions to streaming on eager-open');
+});
+// ---------------------------------------------------------------------------
+// Case 2 — native rail + inline_reasoning:false → narration streams from token 1
+// ---------------------------------------------------------------------------
+test('native rail + inline_reasoning:false: narration streams live from token 1', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: true, inlineReasoning: false, reasoningSeen: false,
+      tokens: ['The', ' answer'], finalContent: 'The answer',
+    });
+  });
+  await h.handler('hi');
+  assert.deepStrictEqual(h.streamed, ['The', ' answer'], 'streams live from the first token, no reasoning_content needed');
+  assert.ok(h.statuses.some(([s, l]) => s === 'streaming' && l === 'Streaming response'),
+    'status bar transitions to streaming on eager-open');
+});
+// ---------------------------------------------------------------------------
+// Case 3 — native rail, NO signal, bare-text-then-orphan-</think>:
+//          reasoning stays HIDDEN (no leak); narration after </think> streams.
+// ---------------------------------------------------------------------------
+test('native rail + NO signal + implicit think: reasoning hidden, post-</think> narration streams', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: true, inlineReasoning: undefined, reasoningSeen: false,
+      tokens: ['Let', ' me', ' think', '</think>', 'The answer', ' is 42'],
+      finalContent: 'The answer is 42',
+    });
+  });
+  await h.handler('hi');
+  // The bare reasoning tokens were buffered+discarded — never streamed, never committed.
+  assert.deepStrictEqual(h.streamed, ['The answer', ' is 42'], 'only post-</think> narration streams');
+  const text = committedText(h.committed);
+  for (const leak of ['Let', 'me', 'think']) {
+    assert.ok(!text.includes(leak), `reasoning token ${JSON.stringify(leak)} must NOT reach scrollback (no leak)`);
+  }
+  assert.ok(text.includes('The answer is 42'), 'narration is rendered');
+});
+// ---------------------------------------------------------------------------
+// Case 4 — XML rail (nativeRail false): identical to old behavior even WITH
+//          reasoning_content seen. The eager-open guard excludes the XML rail.
+// ---------------------------------------------------------------------------
+test('XML rail: buffered-until-</think> behavior unchanged even with reasoning seen (regression guard)', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: false, inlineReasoning: false, reasoningSeen: true,
+      tokens: ['secret', ' reasoning', '</think>', 'visible answer'],
+      finalContent: 'visible answer',
+    });
+  });
+  await h.handler('hi');
+  assert.deepStrictEqual(h.streamed, ['visible answer'], 'XML rail buffers leading text despite signals');
+  const text = committedText(h.committed);
+  assert.ok(!text.includes('secret'), 'XML rail does not leak buffered reasoning');
+  assert.ok(text.includes('visible answer'), 'XML rail narration after </think> still renders');
+});
+// ---------------------------------------------------------------------------
+// Case 5 — finalize after a live stream does NOT double-print (one AI header).
+// ---------------------------------------------------------------------------
+test('finalize after live stream: no double-print (single AI bubble, no _buildAI re-synthesis)', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: true, inlineReasoning: false, reasoningSeen: false,
+      tokens: ['unique-token-α'], finalContent: 'unique-token-α',
+    });
+  });
+  await h.handler('hi');
+  const text = committedText(h.committed);
+  assert.strictEqual(countHeaders(h.committed), 1, 'exactly one ▸ AI-agent header (no re-synthesized bubble)');
+  assert.strictEqual((text.match(/unique-token-α/g) || []).length, 1, 'narration committed exactly once');
+});
+// ---------------------------------------------------------------------------
+// Case 6 (P2 regression) — native rail, reasoning_content seen (eager-open),
+// content carries a STRAY inline </think>. MiniMax emits reasoning via BOTH
+// reasoning_content AND an orphan </think> terminator in content; the
+// StreamParser passes that orphan tag through verbatim (its `/think` form is not
+// a TAG_REGISTRY key). The eager-open path must DROP it (not stream it) while the
+// surrounding narration streams live. Regression from 938f583: the eager-open
+// fell through to streamToken and skipped the orphan-drop guard for every token.
+// (f) The committed scrollback — what persists/replays — stays clean too.
+// ---------------------------------------------------------------------------
+test('native rail + reasoning_content + inline orphan </think>: stray tag dropped, narration streams (P2)', async () => {
+  const h = harness();
+  h.setScenario(async (cb) => {
+    iteration(cb, {
+      native: true, inlineReasoning: undefined, reasoningSeen: true,
+      tokens: ['Here', ' is', '</think>', ' the answer'],
+      finalContent: 'Here is the answer',
+    });
+  });
+  await h.handler('hi');
+  assert.deepStrictEqual(h.streamed, ['Here', ' is', ' the answer'],
+    'orphan </think> dropped from the live stream; surrounding narration streamed live');
+  const text = committedText(h.committed);
+  assert.ok(!text.includes('</think>'), 'the stray </think> never reaches scrollback (live + persisted clean)');
+  assert.ok(text.includes('Here is the answer'), 'narration is rendered');
+});
+// Every closing reasoning tag the StreamParser emits raw must drop on the
+// eager-open path — for the WHOLE stream, not just the first token. The set
+// matches the registered visual tags (think/reasoning/reflection/plan); their
+// `/tag` closing form is never a TAG_REGISTRY key, so all stream verbatim.
+test('native rail eager-open: orphan think/reasoning/reflection/plan close tags all dropped (P2)', async () => {
+  for (const tag of ['think', 'reasoning', 'reflection', 'plan']) {
+    const h = harness();
+    h.setScenario(async (cb) => {
+      iteration(cb, {
+        native: true, inlineReasoning: false, reasoningSeen: false,
+        tokens: ['ok ', `</${tag}>`, 'done'],
+        finalContent: 'ok done',
+      });
+    });
+    await h.handler('hi');
+    assert.deepStrictEqual(h.streamed, ['ok ', 'done'], `</${tag}> dropped on the eager-open path`);
+    assert.ok(!committedText(h.committed).includes(`</${tag}>`), `</${tag}> never committed`);
+  }
+});