npm - @semalt-ai/code - Versions diffs - 1.8.5 → 1.19.0 - Mend

@semalt-ai/code 1.8.5 → 1.19.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146) hide show

package/.claude/settings.local.json +6 -1
package/.github/workflows/ci.yml +69 -0
package/CLAUDE.md +1584 -26
package/README.md +147 -3
package/examples/embed.js +74 -0
package/index.js +251 -10
package/lib/agent.js +711 -104
package/lib/api.js +213 -49
package/lib/args.js +74 -2
package/lib/audit.js +23 -1
package/lib/background.js +584 -0
package/lib/checkpoints.js +757 -0
package/lib/commands/auth.js +94 -0
package/lib/commands/chat-session.js +306 -0
package/lib/commands/chat-slash.js +399 -0
package/lib/commands/chat-turn.js +446 -0
package/lib/commands/chat.js +403 -0
package/lib/commands/custom.js +157 -0
package/lib/commands/history-utils.js +66 -0
package/lib/commands/index.js +268 -0
package/lib/commands/mcp.js +113 -0
package/lib/commands/oneshot.js +193 -0
package/lib/commands/registry.js +269 -0
package/lib/commands/tasks.js +89 -0
package/lib/compact.js +87 -0
package/lib/config.js +333 -11
package/lib/constants.js +372 -3
package/lib/deny.js +199 -0
package/lib/doctor.js +160 -0
package/lib/headless.js +167 -0
package/lib/hooks.js +286 -0
package/lib/images.js +264 -0
package/lib/internals.js +49 -0
package/lib/mcp/boundary.js +131 -0
package/lib/mcp/client.js +270 -0
package/lib/mcp/oauth.js +134 -0
package/lib/memory.js +209 -0
package/lib/metrics.js +37 -2
package/lib/payload.js +54 -0
package/lib/permission-rules.js +401 -0
package/lib/permissions.js +100 -10
package/lib/pricing.js +67 -0
package/lib/proc.js +62 -0
package/lib/prompts.js +84 -5
package/lib/sandbox.js +568 -0
package/lib/sdk.js +328 -0
package/lib/secrets.js +211 -0
package/lib/skills.js +223 -0
package/lib/subagents.js +516 -0
package/lib/tool_registry.js +2558 -0
package/lib/tool_specs.js +222 -2
package/lib/tools.js +272 -1020
package/lib/ui/format.js +22 -1
package/lib/ui/input-field.js +16 -7
package/lib/ui/status-bar.js +79 -11
package/lib/ui/theme.js +1 -0
package/lib/ui/web-activity.js +218 -0
package/lib/verify.js +229 -0
package/lib/web-extract.js +213 -0
package/lib/web-summarize.js +68 -0
package/package.json +19 -4
package/scripts/lint.js +57 -0
package/test/agent-loop.test.js +389 -0
package/test/background.test.js +414 -0
package/test/chat.test.js +114 -0
package/test/checkpoints-agent.test.js +181 -0
package/test/checkpoints.test.js +650 -0
package/test/command-registry.test.js +160 -0
package/test/compact.test.js +116 -0
package/test/completion-lazy.test.js +52 -0
package/test/config-merge.test.js +324 -0
package/test/config-quarantine.test.js +128 -0
package/test/config-write-guard-allow-anywhere.test.js +56 -0
package/test/config-write-guard-skip.test.js +46 -0
package/test/config-write-guard.test.js +153 -0
package/test/context-split.test.js +215 -0
package/test/cost-doctor.test.js +142 -0
package/test/custom-commands-chat.test.js +106 -0
package/test/custom-commands.test.js +230 -0
package/test/deny-windows.test.js +120 -0
package/test/deny.test.js +83 -0
package/test/download-allow-anywhere.test.js +66 -0
package/test/download-confine.test.js +153 -0
package/test/executors.test.js +362 -0
package/test/extract-tool-calls.test.js +315 -0
package/test/fetch-url-validation.test.js +219 -0
package/test/fixtures/tool-calls.js +57 -0
package/test/fixtures/web-page.js +91 -0
package/test/git-tools.test.js +384 -0
package/test/grep-glob-serialize.test.js +242 -0
package/test/grep-glob.test.js +268 -0
package/test/harness/README.md +57 -0
package/test/harness/chat-harness.js +142 -0
package/test/harness/memwarn-headless-child.js +65 -0
package/test/harness/mock-llm.js +120 -0
package/test/harness/mock-mcp-server.js +142 -0
package/test/harness/sse-server.js +69 -0
package/test/headless.test.js +203 -0
package/test/history-utils.test.js +88 -0
package/test/hooks-agent.test.js +238 -0
package/test/hooks-verify-sandbox.test.js +232 -0
package/test/hooks.test.js +216 -0
package/test/http-get-user-agent.test.js +142 -0
package/test/images-api.test.js +208 -0
package/test/images.test.js +238 -0
package/test/max-iterations.test.js +216 -0
package/test/mcp-boundary.test.js +57 -0
package/test/mcp-client.test.js +267 -0
package/test/mcp-oauth.test.js +86 -0
package/test/memory-truncation-warning.test.js +222 -0
package/test/memory.test.js +198 -0
package/test/native-dispatch.test.js +356 -0
package/test/output-chokepoint.test.js +188 -0
package/test/path-guards.test.js +134 -0
package/test/payload.test.js +99 -0
package/test/permission-rules-agent.test.js +210 -0
package/test/permission-rules.test.js +297 -0
package/test/permissions.test.js +163 -0
package/test/plan-mode.test.js +167 -0
package/test/read-paginate.test.js +275 -0
package/test/readonly-tools.test.js +177 -0
package/test/result-cap.test.js +233 -0
package/test/sandbox-agent.test.js +147 -0
package/test/sandbox-integration.test.js +216 -0
package/test/sandbox.test.js +408 -0
package/test/sdk.test.js +234 -0
package/test/shell-output-cap.test.js +181 -0
package/test/skills-chat.test.js +110 -0
package/test/skills.test.js +295 -0
package/test/smoke.test.js +68 -0
package/test/status-bar-pause.test.js +164 -0
package/test/stream-parser.test.js +147 -0
package/test/subagents-agent.test.js +178 -0
package/test/subagents.test.js +222 -0
package/test/tool-registry.test.js +85 -0
package/test/trim-budget.test.js +101 -0
package/test/verify-agent.test.js +317 -0
package/test/verify.test.js +141 -0
package/test/web-activity-ordering.test.js +194 -0
package/test/web-activity.test.js +207 -0
package/test/web-data-extraction-guidance.test.js +71 -0
package/test/web-extract.test.js +185 -0
package/test/web-fetch-agent.test.js +291 -0
package/test/web-fetch-mode.test.js +193 -0
package/test/web-search.test.js +380 -0
package/lib/commands.js +0 -1438

package/test/verify-agent.test.js ADDED Viewed

@@ -0,0 +1,317 @@
+'use strict';
+// Integration tests for self-verification (Task 4.2) driving the REAL
+// runAgentLoop against the mock-LLM harness, with the REAL createVerifyRunner
+// reading config.verify (so spawnSync actually runs the verify command). Verify
+// commands use `node -e …` so they are portable across the CI matrix.
+const { test, before, after } = require('node:test');
+const assert = require('node:assert');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+const ui = require('../lib/ui');
+const { createApiClient } = require('../lib/api');
+const { createToolExecutor, extractToolCalls } = require('../lib/tools');
+const { createPermissionManager } = require('../lib/permissions');
+const { createAgentRunner } = require('../lib/agent');
+const { runHeadless } = require('../lib/headless');
+const { startMockLLM } = require('./harness/mock-llm');
+let prevKey;
+before(() => { prevKey = process.env.SEMALT_API_KEY; process.env.SEMALT_API_KEY = 'test-key'; });
+after(() => {
+  if (prevKey === undefined) delete process.env.SEMALT_API_KEY;
+  else process.env.SEMALT_API_KEY = prevKey;
+});
+const NODE = JSON.stringify(process.execPath);
+// buildRunner mirrors hooks-agent.test.js, but threads `verify` into config so
+// the real verify runner (built inside createAgentRunner from getConfig) sees it.
+function buildRunner(base, verify) {
+  const config = {
+    api_base: base, api_key: 'test-key', default_model: 'test-model',
+    temperature: 0.5, request_timeout_ms: 5000, stream: true, models: [],
+    verify: verify || {},
+    // This suite tests verify ORCHESTRATION, not the OS sandbox (covered by
+    // hooks-verify-sandbox.test.js). Disable the sandbox so the verify commands
+    // run deterministically across the CI matrix regardless of bwrap/Seatbelt.
+    sandbox: { mode: 'off' },
+  };
+  const getConfig = () => config;
+  const saveConfig = (c) => Object.assign(config, c);
+  const api = createApiClient({ getConfig, saveConfig, ui });
+  const pm = createPermissionManager(ui, { skipPermissions: true });
+  pm.setUICallbacks({ onAddMessage: () => {}, onShowModal: () => {}, onCloseModal: () => {}, onCaptureNavigation: () => () => {} });
+  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);
+  const runner = createAgentRunner({
+    chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
+    describePermission, permissionManager: pm, ui, getConfig,
+  });
+  return { runner, config };
+}
+function collector() {
+  const ev = { errors: [], assistants: [] };
+  const cb = {
+    onError: (e) => ev.errors.push(e),
+    onAssistantMessage: (m) => ev.assistants.push(m),
+  };
+  return { ev, cb };
+}
+function lastFedVerify(messages) {
+  return [...messages].reverse().find((m) => m.role === 'user' && /\[verify/.test(m.content || ''));
+}
+function tmpdir() { return fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-verify-')); }
+// ---------------------------------------------------------------------------
+// 1. Advisory: result fed into context, turn ends regardless of pass/fail
+// ---------------------------------------------------------------------------
+// Advisory mode with a failing command: the verify command prints a sentinel
+// and exits 1. The assertions require verifyStatus 'failed', stopReason
+// 'end_turn', exactly one metrics turn, and a user-role "[verify" message that
+// carries the sentinel together with the UNTRUSTED_EXTERNAL_CONTENT marker.
+test('advisory verify FAILS: the result is fed into context but the turn still ends', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.stdout.write('ADVISORY_FAIL_OUT');process.exit(1)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do the task' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed', 'a failing advisory verify reports failed');
+    assert.strictEqual(res.stopReason, 'end_turn', 'advisory NEVER blocks — turn ends normally');
+    assert.strictEqual(res.metrics.turns.length, 1, 'no re-entry into the loop in advisory mode');
+    const fed = lastFedVerify(messages);
+    assert.ok(fed, 'the verify result is fed into context');
+    assert.match(fed.content, /ADVISORY_FAIL_OUT/, 'the command output is present');
+    assert.match(fed.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'verify output is fenced as untrusted');
+  } finally {
+    await mock.close();
+  }
+});
+// Advisory mode with a passing command (exit 0): verifyStatus must be 'passed',
+// the turn still ends with 'end_turn', and the fed "[verify" message must
+// contain the word PASSED.
+test('advisory verify PASSES: the result is fed into context and the turn ends', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do the task' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    const fed = lastFedVerify(messages);
+    assert.ok(fed, 'a passing advisory verify is still fed into context as information');
+    assert.match(fed.content, /PASSED/);
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 2. Enforcing pass: verify passes, the turn ends
+// ---------------------------------------------------------------------------
+// Enforcing mode with a command that exits 0: one queued reply is enough. The
+// assertions require verifyStatus 'passed', stopReason 'end_turn' and exactly
+// one metrics turn.
+test('enforcing verify PASSES on the first try: the turn ends immediately', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('All done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 1, 'passing verify means no re-entry');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 3. Enforcing fail-then-pass: failure re-enters the loop, second attempt passes
+// ---------------------------------------------------------------------------
+test('enforcing verify FAILS then PASSES: the agent is returned to the loop and finishes once verified', async () => {
+  const dir = tmpdir();
+  const marker = path.join(dir, 'mark');
+  const prev = process.env.SEMALT_VERIFY_MARKER;
+  process.env.SEMALT_VERIFY_MARKER = marker;
+  // Fail when the marker is absent (creating it), pass once it exists → exactly
+  // fail-then-pass across two verify runs.
+  const verify = {
+    mode: 'enforcing',
+    command: `${NODE} -e "const fs=require('fs');const f=process.env.SEMALT_VERIFY_MARKER;if(fs.existsSync(f)){process.exit(0)}else{fs.writeFileSync(f,'x');process.exit(1)}"`,
+  };
+  const mock = await startMockLLM();
+  mock.replyWith('Done (attempt 1).');
+  mock.replyWith('Fixed it (attempt 2).');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed', 'final verify passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 2, 'the failing verify re-entered the loop once');
+    // The first (failing) verify pushed a corrective, fenced message into context.
+    const reentry = messages.find((m) => m.role === 'user' && /NOT done/.test(m.content || ''));
+    assert.ok(reentry, 'a corrective re-entry message was injected on failure');
+    assert.match(reentry.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'the failing result is fenced as untrusted');
+    assert.ok(messages.some((m) => m.role === 'assistant' && /attempt 2/.test(m.content)));
+  } finally {
+    await mock.close();
+    if (prev === undefined) delete process.env.SEMALT_VERIFY_MARKER;
+    else process.env.SEMALT_VERIFY_MARKER = prev;
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+// ---------------------------------------------------------------------------
+// 4. Enforcing exhausts: N failures terminate with verify_failed, NOT the iteration cap
+// ---------------------------------------------------------------------------
+// Enforcing mode with a command that always exits 1 and max_attempts: 2. Three
+// replies are queued and runAgentLoop is given an iteration cap of 50; the
+// assertions require stopReason 'verify_failed', exactly two metrics turns, and
+// at least one queued reply still pending on the mock afterwards.
+test('enforcing verify that never passes terminates with stopReason verify_failed after max_attempts', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(1)"`, max_attempts: 2 };
+  const mock = await startMockLLM();
+  // Queue more replies than the verify-attempt limit to prove we stop on the
+  // attempt limit, not by exhausting the (much larger) iteration cap.
+  mock.replyWith('Try 1.');
+  mock.replyWith('Try 2.');
+  mock.replyWith('Try 3.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 50, null, { callbacks: cb });
+    assert.strictEqual(res.stopReason, 'verify_failed', 'precise bound, not max_iterations');
+    assert.strictEqual(res.verifyStatus, 'failed');
+    assert.strictEqual(res.metrics.turns.length, 2, 'stopped after exactly max_attempts (2) failed verifies');
+    assert.ok(mock.pending() >= 1, 'the iteration cap was nowhere near — extra replies left unused');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 5. Timeout treated as a failed verify, no hang
+// ---------------------------------------------------------------------------
+test('a hung verify command times out and is treated as a failed verification (no hang)', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "setTimeout(function(){}, 10000)"`, timeout_ms: 300 };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed', 'timeout is a failed verify');
+    assert.strictEqual(res.stopReason, 'end_turn', 'advisory still ends the turn');
+    const fed = lastFedVerify(messages);
+    assert.match(fed.content, /timed out/i);
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 6. Deny-listed verify command is refused
+// ---------------------------------------------------------------------------
+test('a deny-listed verify command is refused (never run) and reported as a failed verify', async () => {
+  const verify = { mode: 'advisory', command: 'rm -rf /' };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed');
+    const fed = lastFedVerify(messages);
+    assert.match(fed.content, /deny-list/i, 'the result explains the command was refused');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 7. --no-verify skips it; no command configured is a no-op
+// ---------------------------------------------------------------------------
+// The enforcing command here would exit 1, but runAgentLoop is called with
+// `noVerify: true`. The assertions require verifyStatus 'skipped', stopReason
+// 'end_turn', a single metrics turn, and no "[verify" message in the log.
+test('--no-verify skips an otherwise-failing enforcing verify; the turn ends as skipped', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(1)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb, noVerify: true });
+    assert.strictEqual(res.verifyStatus, 'skipped');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 1, 'no verify, no re-entry');
+    assert.ok(!lastFedVerify(messages), 'no verify result fed into context');
+  } finally {
+    await mock.close();
+  }
+});
+// Enforcing mode with an empty command string: the assertions require
+// verifyStatus 'skipped' and stopReason 'end_turn'.
+test('no command configured is a no-op (skipped), even in enforcing mode', async () => {
+  const verify = { mode: 'enforcing', command: '' };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'skipped');
+    assert.strictEqual(res.stopReason, 'end_turn');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 8. Headless surfaces verifyStatus
+// ---------------------------------------------------------------------------
+// Drives runHeadless in 'json' mode with a `write` sink that collects every
+// chunk. The collected text is split on newlines and each non-blank line is
+// parsed as JSON; the assertions require exactly one object, carrying
+// verifyStatus 'passed' and stopReason 'end_turn'.
+test('headless json output surfaces verifyStatus', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const lines = [];
+    await runHeadless({
+      runAgentLoop: runner.runAgentLoop,
+      messages: [{ role: 'user', content: 'do it' }],
+      model: 'test-model',
+      mode: 'json',
+      maxIterations: 10,
+      write: (s) => lines.push(s),
+    });
+    const objs = lines.join('').split('\n').filter((l) => l.trim()).map((l) => JSON.parse(l));
+    assert.strictEqual(objs.length, 1);
+    assert.strictEqual(objs[0].verifyStatus, 'passed', 'verifyStatus is in the json envelope');
+    assert.strictEqual(objs[0].stopReason, 'end_turn');
+  } finally {
+    await mock.close();
+  }
+});

package/test/verify.test.js ADDED Viewed

@@ -0,0 +1,141 @@
+'use strict';
+// Unit tests for self-verification (Task 4.2) — the pure normalizer and the
+// command runner in lib/verify.js. The runner uses the REAL spawnSync via
+// portable `node -e` commands so exit-code semantics, deny-list refusal, the
+// no-op cases, and untrusted-fencing are all exercised directly (no agent loop).
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { normalizeVerify, createVerifyRunner: _createVerifyRunner } = require('../lib/verify');
+const { DEFAULT_VERIFY_TIMEOUT_MS, DEFAULT_VERIFY_MAX_ATTEMPTS } = require('../lib/constants');
+const NODE = JSON.stringify(process.execPath);
+// These tests exercise verify ORCHESTRATION (deny-list, exit-code semantics,
+// timeout, no-op/skip, fencing) — NOT the OS sandbox, which has its own
+// dedicated tests (hooks-verify-sandbox.test.js). Inject a pass-through sandbox
+// resolver so the command runs plain via the 2-arg spawn(command, opts) form.
+const NO_SANDBOX = (command) => ({ run: true, useShell: true, file: command, args: [], sandbox: 'off' });
+const createVerifyRunner = (opts = {}) => _createVerifyRunner({ sandbox: NO_SANDBOX, ...opts });
+const runnerFor = (verify) => createVerifyRunner({ getConfig: () => ({ verify }) });
+// ---------------------------------------------------------------------------
+// normalizeVerify
+// ---------------------------------------------------------------------------
+test('normalizeVerify: defaults for empty/garbage input', () => {
+  for (const bad of [undefined, null, 42, 'x', [], true]) {
+    assert.deepStrictEqual(normalizeVerify(bad), {
+      mode: 'advisory',
+      command: '',
+      timeout_ms: DEFAULT_VERIFY_TIMEOUT_MS,
+      expected_exit_code: 0,
+      max_attempts: DEFAULT_VERIFY_MAX_ATTEMPTS,
+    });
+  }
+});
+// Two halves. First, a fully valid record comes back with its values kept and
+// the command trimmed of surrounding whitespace. Second, a record whose fields
+// are all invalid (unknown mode, blank command, zero timeout, negative exit
+// code, zero attempts) comes back with every field at its default.
+test('normalizeVerify: accepts valid fields, rejects invalid ones', () => {
+  const v = normalizeVerify({
+    mode: 'enforcing', command: '  npm test  ',
+    timeout_ms: 5000, expected_exit_code: 2, max_attempts: 4,
+  });
+  assert.strictEqual(v.mode, 'enforcing');
+  assert.strictEqual(v.command, 'npm test', 'command is trimmed');
+  assert.strictEqual(v.timeout_ms, 5000);
+  assert.strictEqual(v.expected_exit_code, 2);
+  assert.strictEqual(v.max_attempts, 4);
+  // Invalid values fall back to defaults — never unbounded/negative.
+  const bad = normalizeVerify({
+    mode: 'bogus', command: '   ', timeout_ms: 0,
+    expected_exit_code: -1, max_attempts: 0,
+  });
+  assert.strictEqual(bad.mode, 'advisory', 'unknown mode → advisory');
+  assert.strictEqual(bad.command, '', 'blank command → empty (no-op)');
+  assert.strictEqual(bad.timeout_ms, DEFAULT_VERIFY_TIMEOUT_MS);
+  assert.strictEqual(bad.expected_exit_code, 0, 'negative expected exit code rejected');
+  assert.strictEqual(bad.max_attempts, DEFAULT_VERIFY_MAX_ATTEMPTS, 'zero attempts rejected');
+});
+// ---------------------------------------------------------------------------
+// runner — no-op cases
+// ---------------------------------------------------------------------------
+// An empty command must yield a result flagged skipped: true and ran: false.
+test('run: no command configured is a no-op (skipped)', async () => {
+  const res = await runnerFor({ command: '' }).run();
+  assert.strictEqual(res.skipped, true);
+  assert.strictEqual(res.ran, false);
+});
+// Passing { noVerify: true } to run() must yield the same skipped/not-ran
+// result even though a command is configured.
+test('run: --no-verify short-circuits even with a command configured', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(0)"` }).run({ noVerify: true });
+  assert.strictEqual(res.skipped, true);
+  assert.strictEqual(res.ran, false);
+});
+// ---------------------------------------------------------------------------
+// runner — exit-code based success (never stdout parsing)
+// ---------------------------------------------------------------------------
+// Exit 0 with no expected_exit_code configured: passed, ran, exitCode 0, and
+// the output text contains PASSED.
+test('run: exit 0 passes by default', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(0)"` }).run();
+  assert.strictEqual(res.passed, true);
+  assert.strictEqual(res.ran, true);
+  assert.strictEqual(res.exitCode, 0);
+  assert.match(res.output, /PASSED/);
+});
+// Exit 1 with no expected_exit_code configured: not passed, exitCode 1, and
+// the output text contains FAILED.
+test('run: nonzero exit fails by default', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(1)"` }).run();
+  assert.strictEqual(res.passed, false);
+  assert.strictEqual(res.exitCode, 1);
+  assert.match(res.output, /FAILED/);
+});
+// The command writes success-sounding text to stdout yet exits 1; the result
+// must still be a failure.
+test('run: a command that prints "PASS" but exits nonzero still FAILS (exit-code based, not stdout parsing)', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.stdout.write('ALL TESTS PASS');process.exit(1)"` }).run();
+  assert.strictEqual(res.passed, false, 'stdout success words do not make a failing exit pass');
+});
+// With expected_exit_code: 3, a command exiting 3 counts as a pass.
+test('run: configurable expected_exit_code', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(3)"`, expected_exit_code: 3 }).run();
+  assert.strictEqual(res.passed, true, 'exit matches the expected non-zero code');
+  assert.strictEqual(res.exitCode, 3);
+});
+// ---------------------------------------------------------------------------
+// runner — deny-list, timeout, fencing
+// ---------------------------------------------------------------------------
+// `rm -rf /` as the verify command: the result must be non-passing with
+// ran: false, a truthy `denied` field, and output mentioning the deny-list.
+test('run: a deny-listed verify command is refused (never run) and reported non-passing', async () => {
+  const res = await runnerFor({ command: 'rm -rf /' }).run();
+  assert.strictEqual(res.passed, false);
+  assert.strictEqual(res.ran, false, 'the command was never executed');
+  assert.ok(res.denied, 'a deny-list label is recorded');
+  assert.match(res.output, /deny-list/i);
+});
+// The command keeps a 10 s timer alive while timeout_ms is 300: the result
+// must have timedOut: true, passed: false, and output mentioning "timed out".
+test('run: a hung command times out and is treated as a failed verify (no hang)', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "setTimeout(function(){}, 10000)"`, timeout_ms: 300 }).run();
+  assert.strictEqual(res.timedOut, true);
+  assert.strictEqual(res.passed, false);
+  assert.match(res.output, /timed out/i);
+});
+// `res.fenced` must contain both the UNTRUSTED_EXTERNAL_CONTENT marker and the
+// sentinel the command wrote to stdout.
+test('run: output is fenced as untrusted external content', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.stdout.write('SENTINEL_OUT_9');process.exit(1)"` }).run();
+  assert.match(res.fenced, /UNTRUSTED_EXTERNAL_CONTENT/, 'fenced with the standard delimiter');
+  assert.match(res.fenced, /SENTINEL_OUT_9/, 'the command output is inside the fence');
+});
+test('run: deny-list and timeout both short-circuit via an injected spawn that is never called', async () => {
+  let spawnCalls = 0;
+  const spy = () => { spawnCalls++; return { status: 0, stdout: '', stderr: '' }; };
+  const runner = createVerifyRunner({ getConfig: () => ({ verify: { command: 'rm -rf /' } }), spawn: spy });
+  const res = await runner.run();
+  assert.strictEqual(spawnCalls, 0, 'a deny-listed command never reaches spawn');
+  assert.ok(res.denied);
+});

package/test/web-activity-ordering.test.js ADDED Viewed

@@ -0,0 +1,194 @@
+'use strict';
+// Web-activity ordering (W.3 regression fix). The collapsed "✓ web · …" summary
+// must commit to scrollback BEFORE the agent's answer, not after it.
+//
+// The W.3 regression: http_get/web_search deferred their scrollback commit from
+// "tool end" to webTracker.flush(), and in a "web-op(s) → answer" turn the only
+// flush that fired was the turn-end `finally` — which runs AFTER runAgentLoop
+// returns, i.e. after the answer was already committed. The fix flushes the open
+// web group in onAssistantMessage when cleanContent is non-empty (the terminal
+// response signal), while intermediate empty-content iterations keep the group
+// open so multi-step search→fetch still collapses to one line.
+//
+// These tests drive the REAL createTurnHandler callbacks (chat-turn.js) with a
+// mock runAgentLoop that invokes them in the order agent.js does — per iteration
+// onAssistantMessage(displayReply) fires first (empty '' when the iteration
+// carried tool calls, non-empty on the final answer), then the tools execute —
+// recording an ordered event log so we can assert "summary before answer".
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { stripAnsi } = require('../lib/ui/utils');
+const { createTurnHandler } = require('../lib/commands/chat-turn');
+// Test rig: a stand-in writer and a stand-in chatHistory that both append to
+// ONE ordered event log, so each test can assert the relative commit order.
+//   - writerModule.endActivity records either a 'web-summary' or a 'tool-line'
+//     event, decided purely from the ANSI-stripped line text. formatToolLine is
+//     stubbed to return "TOOL:<tag>", which keeps ordinary tool lines out of
+//     the 'web-summary' bucket.
+//   - chatHistory.finalizeLastMessage records an 'answer' event, but only for
+//     non-blank content.
+// Returns { events, handler, setScenario }; setScenario installs the function
+// the mock runAgentLoop will drive with the handler's callbacks.
+function harness() {
+  const events = [];
+  const log = (entry) => { events.push(entry); };
+
+  const writerModule = {
+    startActivity() {},
+    updateActivity() {},
+    endActivity(id, line) {
+      const text = stripAnsi(String(line));
+      const isWebSummary = /web\b/.test(text) && /(source|search|web)/.test(text) && !text.startsWith('TOOL:');
+      log({ kind: isWebSummary ? 'web-summary' : 'tool-line', line: text });
+    },
+    scrollback(line) {
+      log({ kind: 'scrollback', line: String(line) });
+    },
+  };
+
+  const chatHistory = {
+    addMessage() {},
+    streamToken() {},
+    clearStreamingContent() {},
+    // A blank finalize (the suppressed intermediate iteration) produces no
+    // visible answer, so it is not logged; only the terminal, non-empty answer
+    // is recorded.
+    finalizeLastMessage(content) {
+      if (!content || !content.trim()) return;
+      log({ kind: 'answer', content });
+    },
+  };
+
+  const statusBar = {
+    update() {},
+    onToken() {},
+    addPendingTokens() {},
+    updateMetrics() {},
+    setCost() {},
+  };
+
+  const inputField = {
+    on() {},
+    removeListener() {},
+    releaseNavigation() {},
+    setDisabled() {},
+  };
+
+  // Replaced by each test (through setScenario) before the handler is invoked.
+  let scenario = async () => {};
+
+  // Mock agent loop: hands the handler's callbacks to the current scenario and
+  // then resolves with a minimal result object.
+  const runAgentLoop = async (messages, model, maxIter, limit, loopOpts) => {
+    const { callbacks } = loopOpts;
+    await scenario(callbacks);
+    return { messages, metrics: { turns: [] }, withheldActions: [] };
+  };
+
+  const ctx = {
+    inputField,
+    statusBar,
+    chatHistory,
+    writerModule,
+    runAgentLoop,
+    getConfig: () => ({
+      auth_token: 'tok',
+      max_iterations: 50,
+      show_cost: false,
+      system_prompt_mode: 'system_role',
+    }),
+    approxTokens: () => 0,
+    resolveCommand: () => null,
+    opts: {},
+    TAG_REGISTRY: {},
+    formatToolLine: (o) => `TOOL:${o && o.tag}`,
+    collapseListMsg() {},
+    handlePendingSelection() {},
+    showPendingStep() {},
+    activateNavCapture() {},
+    finalizeListMsg() {},
+    createChatIfNeeded: async () => {},
+    saveTurnToDashboard: async () => {},
+    saveSession() {},
+    messages: [],
+    currentModel: 'm',
+    debugMode: false,
+    pendingImages: [],
+    chatSync: async () => '',
+    resolvedSystemPrompt: '',
+    resolvedTokenLimit: null,
+    planMode: false,
+  };
+
+  const handler = createTurnHandler(ctx, {});
+  const setScenario = (fn) => { scenario = fn; };
+  return { events, handler, setScenario };
+}
+// Helpers to simulate the agent.js per-iteration callback order.
+function webToolIteration(cb, tag, input, meta) {
+  // One simulated tool-carrying iteration: an empty assistant message first
+  // (suppressed, because the iteration had a tool call), then tool start/end.
+  const id = `${tag}-1`;
+  // web_search carries its input as `query`; every other tag carries it as `url`.
+  // A fresh attrs object is built per callback, as separate literals would be.
+  const buildAttrs = () => (tag === 'web_search' ? { query: input } : { url: input });
+  cb.onAssistantMessage('');
+  cb.onToolStart(tag, input, { id, attrs: buildAttrs() });
+  cb.onToolEnd(tag, {}, 120, { id, attrs: buildAttrs(), meta, error: null });
+}
+function indexOfKind(events, kind) {
+  // Position of the first logged event of the given kind, or -1 if none exists.
+  const hasKind = (entry) => entry.kind === kind;
+  return events.findIndex(hasKind);
+}
+// ---------------------------------------------------------------------------
+// The regression: single http_get → answer commits the summary BEFORE the answer
+// ---------------------------------------------------------------------------
+test('single http_get → answer: web summary commits before the answer', async () => {
+  const rig = harness();
+  rig.setScenario(async (cb) => {
+    // One fetch iteration, then the terminal (non-empty) assistant message.
+    webToolIteration(cb, 'http_get', 'https://a.example', { status_code: 200, bytes: 1000 });
+    cb.onAssistantMessage('Here is the synthesized answer.');
+  });
+  await rig.handler('summarize https://a.example');
+
+  const { events } = rig;
+  const webSummaries = events.filter((e) => e.kind === 'web-summary');
+  assert.strictEqual(webSummaries.length, 1, 'exactly one collapsed summary');
+
+  const summaryAt = indexOfKind(events, 'web-summary');
+  const answerAt = indexOfKind(events, 'answer');
+  assert.ok(summaryAt >= 0 && answerAt >= 0, 'both committed');
+  assert.ok(summaryAt < answerAt, 'the web summary precedes the answer (the bug being fixed)');
+  assert.match(webSummaries[0].line, /1 source read/);
+});
+// ---------------------------------------------------------------------------
+// The W.3 guarantee preserved: multi-step search→fetch still collapses to ONE line
+// ---------------------------------------------------------------------------
+test('web_search → http_get → answer: one collapsed line, before the answer; intermediate iteration does NOT flush', async () => {
+  const rig = harness();
+  rig.setScenario(async (cb) => {
+    // Iteration 1: the search, in its own LLM round-trip.
+    webToolIteration(cb, 'web_search', 'corruption scandals', null);
+    // Iteration 2: the fetch. Its empty onAssistantMessage('') must leave the
+    // web group open; a flush here would split the summary into two lines.
+    webToolIteration(cb, 'http_get', 'https://a.example', { status_code: 200, bytes: 1000 });
+    // Iteration 3: the terminal answer.
+    cb.onAssistantMessage('Final answer with citations.');
+  });
+  await rig.handler('research corruption scandals');
+
+  const { events } = rig;
+  const webSummaries = events.filter((e) => e.kind === 'web-summary');
+  assert.strictEqual(webSummaries.length, 1, 'multi-step web activity collapses to exactly ONE line (W.3 guarantee)');
+
+  const summaryAt = indexOfKind(events, 'web-summary');
+  const answerAt = indexOfKind(events, 'answer');
+  assert.ok(summaryAt < answerAt, 'the single collapsed summary precedes the answer');
+
+  // The one line has to mention both the search and the page that was read.
+  const [onlySummary] = webSummaries;
+  assert.match(onlySummary.line, /search "corruption scandals"/);
+  assert.match(onlySummary.line, /1 source read/);
+});
+// ---------------------------------------------------------------------------
+// Safety net: an empty / interrupted turn still flushes via the turn-end finally
+// ---------------------------------------------------------------------------
+test('empty/interrupted answer: summary still committed via the turn-end finally', async () => {
+  const rig = harness();
+  rig.setScenario(async (cb) => {
+    // Web work happens, but no non-empty assistant message ever follows (as
+    // with an iteration cap or an interruption). With nothing to trigger the
+    // flush in onAssistantMessage, the turn-end `finally` is the fallback.
+    webToolIteration(cb, 'http_get', 'https://a.example', { status_code: 200, bytes: 1000 });
+  });
+  await rig.handler('fetch https://a.example');
+
+  const { events } = rig;
+  const webSummaries = events.filter((e) => e.kind === 'web-summary');
+  assert.strictEqual(webSummaries.length, 1, 'the summary is not lost — flushed in finally');
+  assert.strictEqual(indexOfKind(events, 'answer'), -1, 'no non-empty answer was finalized');
+});
+// ---------------------------------------------------------------------------
+// Non-web tool after web ops: still flushes via onToolStart (unregressed)
+// ---------------------------------------------------------------------------
+test('non-web tool after web ops: summary flushed before the non-web tool line', async () => {
+  const rig = harness();
+  rig.setScenario(async (cb) => {
+    // Iteration 1: a web fetch opens the web group.
+    webToolIteration(cb, 'http_get', 'https://a.example', { status_code: 200, bytes: 1000 });
+    // Iteration 2: a non-web tool (read_file). Its onToolStart is expected to
+    // close the open web group first, so the summary lands above its line.
+    const readMeta = { id: 'rf-1', attrs: { path: '/x' } };
+    cb.onAssistantMessage('');
+    cb.onToolStart('read_file', '/x', readMeta);
+    cb.onToolEnd('read_file', 'contents', 5, { id: 'rf-1', attrs: { path: '/x' }, meta: null, error: null });
+    // Iteration 3: the terminal answer.
+    cb.onAssistantMessage('Done.');
+  });
+  await rig.handler('fetch then read');
+
+  const { events } = rig;
+  const webSummaries = events.filter((e) => e.kind === 'web-summary');
+  assert.strictEqual(webSummaries.length, 1, 'one web summary');
+
+  const summaryAt = indexOfKind(events, 'web-summary');
+  const toolLineAt = events.findIndex((e) => e.kind === 'tool-line' && /read_file/.test(e.line));
+  const answerAt = indexOfKind(events, 'answer');
+  assert.ok(summaryAt < toolLineAt, 'web summary precedes the non-web tool line (flushed by onToolStart)');
+  assert.ok(toolLineAt < answerAt, 'and both precede the answer');
+});