npm - @semalt-ai/code - Versions diffs - 1.8.5 → 1.20.0 - Mend

@semalt-ai/code 1.8.5 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (192) hide show

package/.claude/settings.local.json +7 -1
package/.github/workflows/ci.yml +69 -0
package/ARCHITECTURE.md +6 -95
package/CLAUDE.md +196 -316
package/README.md +148 -4
package/docs/ARCHITECTURE.md +1321 -0
package/docs/CONFIG.md +340 -0
package/docs/HISTORY.md +245 -0
package/examples/embed.js +74 -0
package/index.js +251 -10
package/lib/agent.js +856 -120
package/lib/api.js +239 -50
package/lib/args.js +74 -2
package/lib/audit.js +23 -1
package/lib/background.js +584 -0
package/lib/checkpoints.js +757 -0
package/lib/commands/auth.js +94 -0
package/lib/commands/chat-session.js +489 -0
package/lib/commands/chat-slash.js +415 -0
package/lib/commands/chat-turn.js +669 -0
package/lib/commands/chat.js +407 -0
package/lib/commands/custom.js +157 -0
package/lib/commands/history-utils.js +66 -0
package/lib/commands/index.js +268 -0
package/lib/commands/mcp.js +113 -0
package/lib/commands/oneshot.js +193 -0
package/lib/commands/registry.js +269 -0
package/lib/commands/tasks.js +89 -0
package/lib/compact.js +87 -0
package/lib/config.js +360 -11
package/lib/constants.js +401 -3
package/lib/deny.js +199 -0
package/lib/doctor.js +160 -0
package/lib/headless.js +202 -0
package/lib/hooks.js +286 -0
package/lib/images.js +270 -0
package/lib/internals.js +49 -0
package/lib/mcp/boundary.js +131 -0
package/lib/mcp/client.js +270 -0
package/lib/mcp/oauth.js +134 -0
package/lib/memory.js +209 -0
package/lib/metrics.js +37 -2
package/lib/payload.js +54 -0
package/lib/permission-rules.js +401 -0
package/lib/permissions.js +123 -26
package/lib/pricing.js +67 -0
package/lib/proc.js +62 -0
package/lib/prompts.js +99 -8
package/lib/sandbox.js +568 -0
package/lib/sdk.js +328 -0
package/lib/secrets.js +211 -0
package/lib/skills.js +223 -0
package/lib/subagents.js +516 -0
package/lib/tool_registry.js +2862 -0
package/lib/tool_specs.js +263 -9
package/lib/tools.js +352 -1039
package/lib/ui/anim.js +86 -0
package/lib/ui/ansi.js +17 -27
package/lib/ui/chat-history.js +253 -71
package/lib/ui/create-ui.js +67 -24
package/lib/ui/diff.js +90 -25
package/lib/ui/file-activity.js +236 -0
package/lib/ui/format.js +195 -29
package/lib/ui/input-field.js +21 -11
package/lib/ui/md-stream.js +234 -0
package/lib/ui/render-operation.js +113 -0
package/lib/ui/select.js +1 -4
package/lib/ui/status-bar.js +146 -36
package/lib/ui/stream.js +20 -13
package/lib/ui/theme.js +190 -44
package/lib/ui/tool-operation.js +190 -0
package/lib/ui/utils.js +9 -5
package/lib/ui/web-activity.js +270 -0
package/lib/ui/writer.js +159 -45
package/lib/ui.js +1 -1
package/lib/verify.js +229 -0
package/lib/web-extract.js +213 -0
package/lib/web-summarize.js +68 -0
package/package.json +19 -4
package/scripts/lint.js +57 -0
package/test/agent-loop.test.js +389 -0
package/test/anim-driver.test.js +153 -0
package/test/ask-user-display.test.js +226 -0
package/test/ask-user-gate.test.js +231 -0
package/test/background.test.js +414 -0
package/test/chat-history-nocolor.test.js +155 -0
package/test/chat-relogin.test.js +207 -0
package/test/chat.test.js +114 -0
package/test/checkpoints-agent.test.js +181 -0
package/test/checkpoints.test.js +650 -0
package/test/command-registry.test.js +160 -0
package/test/compact.test.js +116 -0
package/test/completion-lazy.test.js +52 -0
package/test/config-merge.test.js +324 -0
package/test/config-quarantine.test.js +128 -0
package/test/config-write-guard-allow-anywhere.test.js +56 -0
package/test/config-write-guard-skip.test.js +46 -0
package/test/config-write-guard.test.js +153 -0
package/test/context-split.test.js +215 -0
package/test/cost-doctor.test.js +142 -0
package/test/custom-commands-chat.test.js +106 -0
package/test/custom-commands.test.js +230 -0
package/test/defer-detail-band.test.js +403 -0
package/test/deny-windows.test.js +120 -0
package/test/deny.test.js +83 -0
package/test/detail-band-tab-flatten.test.js +242 -0
package/test/download-allow-anywhere.test.js +66 -0
package/test/download-confine.test.js +153 -0
package/test/exec-diff.test.js +268 -0
package/test/executors.test.js +599 -0
package/test/extract-tool-calls.test.js +349 -0
package/test/fetch-url-validation.test.js +219 -0
package/test/file-activity.test.js +522 -0
package/test/fixtures/tool-calls.js +57 -0
package/test/fixtures/web-page.js +91 -0
package/test/git-tools.test.js +384 -0
package/test/grep-glob-serialize.test.js +242 -0
package/test/grep-glob.test.js +268 -0
package/test/grep-path-target.test.js +227 -0
package/test/harness/README.md +57 -0
package/test/harness/chat-harness.js +143 -0
package/test/harness/memwarn-headless-child.js +65 -0
package/test/harness/mock-llm.js +120 -0
package/test/harness/mock-mcp-server.js +142 -0
package/test/harness/sse-server.js +69 -0
package/test/headless.test.js +348 -0
package/test/history-utils.test.js +88 -0
package/test/hooks-agent.test.js +238 -0
package/test/hooks-verify-sandbox.test.js +232 -0
package/test/hooks.test.js +216 -0
package/test/http-get-user-agent.test.js +142 -0
package/test/images-api.test.js +208 -0
package/test/images.test.js +238 -0
package/test/input-field-ctrl-o.test.js +37 -0
package/test/live-height-physical.test.js +281 -0
package/test/max-iterations.test.js +218 -0
package/test/mcp-boundary.test.js +57 -0
package/test/mcp-client.test.js +267 -0
package/test/mcp-oauth.test.js +86 -0
package/test/md-stream.test.js +183 -0
package/test/memory-truncation-warning.test.js +222 -0
package/test/memory.test.js +198 -0
package/test/native-dispatch.test.js +409 -0
package/test/native-live-narration.test.js +254 -0
package/test/output-chokepoint.test.js +188 -0
package/test/output-heredoc-leak.test.js +195 -0
package/test/output-preview.test.js +245 -0
package/test/path-guards.test.js +134 -0
package/test/payload.test.js +99 -0
package/test/permission-rules-agent.test.js +210 -0
package/test/permission-rules.test.js +297 -0
package/test/permissions.test.js +362 -0
package/test/plan-mode.test.js +167 -0
package/test/read-paginate.test.js +275 -0
package/test/readonly-tools.test.js +177 -0
package/test/render-operation.test.js +317 -0
package/test/replay-descriptor-xml.test.js +216 -0
package/test/replay-descriptor.test.js +189 -0
package/test/replay-web-aggregate.test.js +291 -0
package/test/replay-web-persist.test.js +241 -0
package/test/result-cap.test.js +233 -0
package/test/running-glyph-anim.test.js +111 -0
package/test/sandbox-agent.test.js +147 -0
package/test/sandbox-integration.test.js +216 -0
package/test/sandbox.test.js +408 -0
package/test/sdk.test.js +234 -0
package/test/shell-output-cap.test.js +181 -0
package/test/skills-chat.test.js +110 -0
package/test/skills.test.js +295 -0
package/test/smoke.test.js +68 -0
package/test/status-bar-driver.test.js +93 -0
package/test/status-bar-pause.test.js +164 -0
package/test/status-bar-resync.test.js +188 -0
package/test/stream-parser.test.js +171 -0
package/test/subagents-agent.test.js +178 -0
package/test/subagents.test.js +222 -0
package/test/theme-palette.test.js +166 -0
package/test/tool-registry.test.js +85 -0
package/test/trim-budget.test.js +101 -0
package/test/truncate-visible.test.js +78 -0
package/test/verify-agent.test.js +317 -0
package/test/verify.test.js +141 -0
package/test/view-image.test.js +199 -0
package/test/web-activity-ordering.test.js +203 -0
package/test/web-activity.test.js +207 -0
package/test/web-data-extraction-guidance.test.js +71 -0
package/test/web-extract.test.js +185 -0
package/test/web-fetch-agent.test.js +291 -0
package/test/web-fetch-mode.test.js +193 -0
package/test/web-search.test.js +380 -0
package/lib/commands.js +0 -1438
package/path +0 -1

package/test/trim-budget.test.js ADDED Viewed

@@ -0,0 +1,101 @@
+'use strict';
+// Characterization tests for trimToTokenBudget (Task 1.1).
+// Budgets are derived from the function's own chars/4 estimate so the number of
+// dropped messages is deterministic rather than tuned by hand.
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { trimToTokenBudget } = require('../lib/api');
+// Mirror of the internal estimate: floor(JSON.stringify(msgs).length / 4).
+const est = (msgs) => Math.floor(JSON.stringify(msgs).length / 4);
+const sys = { role: 'system', content: 'SYSTEM PROMPT' };
+const task = { role: 'user', content: 'the original task' };
+const mk = (tag, n) => ({ role: 'user', content: `${tag}:` + 'x'.repeat(n) });
+test('under budget: messages returned unchanged', () => {
+  const msgs = [sys, task, mk('a', 100)];
+  const out = trimToTokenBudget(msgs, est(msgs) + 1000);
+  assert.deepStrictEqual(out, msgs);
+});
+test('no non-system messages: returns a copy of the system messages only', () => {
+  const sys2 = { role: 'system', content: 'second system' };
+  const msgs = [sys, sys2];
+  const out = trimToTokenBudget(msgs, 1);
+  assert.deepStrictEqual(out, [sys, sys2]);
+  assert.notStrictEqual(out, msgs, 'returns a new array');
+});
+test('drops intermediate messages oldest-first, keeping system + pinned + newest', () => {
+  const f1 = mk('f1', 4000);
+  const f2 = mk('f2', 4000);
+  const f3 = mk('f3', 4000);
+  const msgs = [sys, task, f1, f2, f3];
+  // Budget that fits [sys, task, f3] but not [sys, task, f2, f3].
+  const budget = est([sys, task, f3]) + 50;
+  assert.ok(budget < est([sys, task, f2, f3]), 'precondition: budget forces drops');
+  const out = trimToTokenBudget(msgs, budget);
+  assert.deepStrictEqual(out, [sys, task, f3]);
+});
+test('pinned (first non-system) is never dropped even under heavy pressure', () => {
+  const big = mk('tail', 8000);
+  const msgs = [sys, task, big];
+  const out = trimToTokenBudget(msgs, est([sys, task]) + 10);
+  assert.strictEqual(out[0], sys);
+  assert.strictEqual(out[1], task);
+});
+test('truncates the last remaining tail message when still over budget', () => {
+  const huge = mk('tail', 40000);
+  const msgs = [sys, task, huge];
+  // Down to one tail message, but estimate still exceeds budget AND there is
+  // positive room (budget*4 - other - 200 > 0) so truncation engages.
+  const budget = est([sys, task]) + 1500; // ~6000 chars of room for the tail
+  const out = trimToTokenBudget(msgs, budget);
+  assert.strictEqual(out.length, 3);
+  assert.strictEqual(out[0], sys);
+  assert.strictEqual(out[1], task);
+  assert.match(out[2].content, /^\[…content truncated to fit model limit…\]\n/);
+  assert.ok(out[2].content.length < huge.content.length, 'tail was shortened');
+  assert.ok(out[2].content.endsWith('x'.repeat(50)), 'keeps the END of the content');
+});
+test('truncates the pinned message when there is no tail and it overflows', () => {
+  const hugePinned = mk('pinned', 40000);
+  const msgs = [sys, hugePinned];
+  const budget = est([sys]) + 1500;
+  const out = trimToTokenBudget(msgs, budget);
+  assert.strictEqual(out.length, 2);
+  assert.strictEqual(out[0], sys);
+  assert.match(out[1].content, /^\[…content truncated to fit model limit…\]\n/);
+  assert.ok(out[1].content.length < hugePinned.content.length);
+});
+test('QUIRK: when there is no room (budget too small) the last message is left intact', () => {
+  // available = budget*4 - other - 200 <= 0, so the truncation branch is skipped
+  // and the (oversized) message is returned unchanged rather than emptied.
+  const huge = mk('tail', 40000);
+  const msgs = [sys, task, huge];
+  const out = trimToTokenBudget(msgs, 1); // 4 chars of budget
+  assert.deepStrictEqual(out, [sys, task, huge]);
+});
+test('multiple system messages are all preserved and kept ahead of content', () => {
+  const sys2 = { role: 'system', content: 'second system rule' };
+  const f1 = mk('f1', 4000);
+  const f2 = mk('f2', 4000);
+  const msgs = [sys, sys2, task, f1, f2];
+  const out = trimToTokenBudget(msgs, est([sys, sys2, task, f2]) + 50);
+  assert.strictEqual(out[0], sys);
+  assert.strictEqual(out[1], sys2);
+  assert.strictEqual(out[2], task);
+  assert.ok(!out.includes(f1), 'oldest filler dropped');
+});

package/test/truncate-visible.test.js ADDED Viewed

@@ -0,0 +1,78 @@
+'use strict';
+// Direct unit tests for truncateVisible's trailing-reset decision and width
+// math. The trailing `\x1b[0m` must be CONTENT-conditional: appended only when
+// the (possibly truncated) output actually contains an escape, so escape-free
+// output stays escape-free (no NO_COLOR leak) while a cut-open SGR span is
+// still defensively closed (no color bleed).
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { truncateVisible, termWidth } = require('../lib/ui/utils');
+const RST = '\x1b[0m'; // SGR reset sequence — the trailing reset these tests look for
+const RED = '\x1b[31m'; // SGR opener used as the representative color span
+test('escape-free string within budget has no trailing reset', () => {
+  const out = truncateVisible('hello world', 80);
+  assert.strictEqual(out, 'hello world');
+  assert.strictEqual(out.indexOf('\x1b'), -1);
+});
+test('escape-free string truncated mid-string has no trailing reset', () => {
+  const out = truncateVisible('hello world', 5);
+  assert.strictEqual(out, 'hello');
+  assert.strictEqual(out.indexOf('\x1b'), -1);
+});
+test('SGR cut mid-span ends with a reset (bleed-safe)', () => {
+  // Opening SGR is copied through (0 width); the next over-budget glyph breaks
+  // the loop BEFORE that span's own reset → unclosed color span → RST required.
+  const out = truncateVisible(RED + 'colored text', 4);
+  assert.ok(out.indexOf('\x1b') !== -1, 'opener should survive');
+  assert.ok(out.endsWith(RST), 'cut-open span must be closed with a reset');
+  assert.strictEqual(out, RED + 'colo' + RST);
+});
+test('fully-paired SGR within budget round-trips, reset present from content', () => {
+  // opener + content + closer all fit. Because an escape is present we append a
+  // trailing RST (defensive — idempotent on already-reset content).
+  const input = RED + 'hi' + RST;
+  const out = truncateVisible(input, 80);
+  assert.ok(out.startsWith(RED + 'hi' + RST), 'full span preserved');
+  assert.ok(out.endsWith(RST));
+  assert.strictEqual(out, input + RST);
+});
+test('max === 0 returns empty string (no bare reset)', () => {
+  assert.strictEqual(truncateVisible('anything', 0), '');
+  assert.strictEqual(truncateVisible(RED + 'x', 0), '');
+});
+test('empty / falsy input returns empty string', () => {
+  assert.strictEqual(truncateVisible('', 10), '');
+  assert.strictEqual(truncateVisible(null, 10), '');
+  assert.strictEqual(truncateVisible(undefined, 10), '');
+});
+test('CJK glyphs count as width 2 in the truncation column math', () => {
+  // Three ideographs (U+65E5 U+672C U+8A9E), each 2 cols. Budget 4 fits exactly
+  // two; output is escape-free → no trailing reset.
+  const cjk = '日本語';
+  const out = truncateVisible(cjk, 4);
+  assert.strictEqual(out, '日本');
+  assert.strictEqual(out.indexOf('\x1b'), -1);
+  assert.strictEqual(termWidth('日本'), 4);
+});
+test('combining marks count as width 0', () => {
+  // 'e' + combining acute (U+0301): 1 visible column total. Budget 1 keeps both
+  // the base glyph and the zero-width mark. Built from explicit codepoints so
+  // the test does not depend on the source file's Unicode normalization.
+  const input = 'é';
+  assert.strictEqual(termWidth(input), 1);
+  const out = truncateVisible(input, 1);
+  assert.strictEqual(out, input);
+  assert.strictEqual(out.indexOf('\x1b'), -1);
+});

package/test/verify-agent.test.js ADDED Viewed

@@ -0,0 +1,317 @@
+'use strict';
+// Integration tests for self-verification (Task 4.2) driving the REAL
+// runAgentLoop against the mock-LLM harness, with the REAL createVerifyRunner
+// reading config.verify (so spawnSync actually runs the verify command). Verify
+// commands use `node -e …` so they are portable across the CI matrix.
+const { test, before, after } = require('node:test');
+const assert = require('node:assert');
+const fs = require('fs');
+const os = require('os');
+const path = require('path');
+const ui = require('../lib/ui');
+const { createApiClient } = require('../lib/api');
+const { createToolExecutor, extractToolCalls } = require('../lib/tools');
+const { createPermissionManager } = require('../lib/permissions');
+const { createAgentRunner } = require('../lib/agent');
+const { runHeadless } = require('../lib/headless');
+const { startMockLLM } = require('./harness/mock-llm');
+let prevKey;
+before(() => { prevKey = process.env.SEMALT_API_KEY; process.env.SEMALT_API_KEY = 'test-key'; });
+after(() => {
+  if (prevKey === undefined) delete process.env.SEMALT_API_KEY;
+  else process.env.SEMALT_API_KEY = prevKey;
+});
+const NODE = JSON.stringify(process.execPath); // double-quoted path of the running Node binary, interpolated into the verify command strings below
+// buildRunner mirrors hooks-agent.test.js, but threads `verify` into config so
+// the real verify runner (built inside createAgentRunner from getConfig) sees it.
+//
+// `base` becomes config.api_base (the tests pass the mock LLM's `base`);
+// `verify` becomes config.verify, falling back to {} when falsy.
+// Returns { runner, config }: the agent runner plus the config object it reads.
+function buildRunner(base, verify) {
+  const config = {
+    api_base: base, api_key: 'test-key', default_model: 'test-model',
+    temperature: 0.5, request_timeout_ms: 5000, stream: true, models: [],
+    verify: verify || {},
+    // This suite tests verify ORCHESTRATION, not the OS sandbox (covered by
+    // hooks-verify-sandbox.test.js). Disable the sandbox so the verify commands
+    // run deterministically across the CI matrix regardless of bwrap/Seatbelt.
+    sandbox: { mode: 'off' },
+  };
+  // Both accessors share the single `config` object above, so anything written
+  // through saveConfig is visible to later getConfig calls.
+  const getConfig = () => config;
+  const saveConfig = (c) => Object.assign(config, c);
+  const api = createApiClient({ getConfig, saveConfig, ui });
+  // NOTE(review): skipPermissions presumably bypasses interactive permission
+  // prompts — confirm against lib/permissions.
+  const pm = createPermissionManager(ui, { skipPermissions: true });
+  // UI callbacks are stubbed with no-ops; onCaptureNavigation hands back a no-op function.
+  pm.setUICallbacks({ onAddMessage: () => {}, onShowModal: () => {}, onCloseModal: () => {}, onCaptureNavigation: () => () => {} });
+  const { agentExecShell, agentExecFile, describePermission } = createToolExecutor(pm, ui, getConfig);
+  const runner = createAgentRunner({
+    chatStream: api.chatStream, extractToolCalls, agentExecShell, agentExecFile,
+    describePermission, permissionManager: pm, ui, getConfig,
+  });
+  return { runner, config };
+}
+function collector() {
+  const ev = { errors: [], assistants: [] };
+  const cb = {
+    onError: (e) => ev.errors.push(e),
+    onAssistantMessage: (m) => ev.assistants.push(m),
+  };
+  return { ev, cb };
+}
+function lastFedVerify(messages) {
+  return [...messages].reverse().find((m) => m.role === 'user' && /\[verify/.test(m.content || ''));
+}
+function tmpdir() { return fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-verify-')); }
+// ---------------------------------------------------------------------------
+// 1. Advisory: result fed into context, turn ends regardless of pass/fail
+// ---------------------------------------------------------------------------
+test('advisory verify FAILS: the result is fed into context but the turn still ends', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.stdout.write('ADVISORY_FAIL_OUT');process.exit(1)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do the task' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed', 'a failing advisory verify reports failed');
+    assert.strictEqual(res.stopReason, 'end_turn', 'advisory NEVER blocks — turn ends normally');
+    assert.strictEqual(res.metrics.turns.length, 1, 'no re-entry into the loop in advisory mode');
+    const fed = lastFedVerify(messages);
+    assert.ok(fed, 'the verify result is fed into context');
+    assert.match(fed.content, /ADVISORY_FAIL_OUT/, 'the command output is present');
+    assert.match(fed.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'verify output is fenced as untrusted');
+  } finally {
+    await mock.close();
+  }
+});
+test('advisory verify PASSES: the result is fed into context and the turn ends', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do the task' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    const fed = lastFedVerify(messages);
+    assert.ok(fed, 'a passing advisory verify is still fed into context as information');
+    assert.match(fed.content, /PASSED/);
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 2. Enforcing pass: verify passes, the turn ends
+// ---------------------------------------------------------------------------
+test('enforcing verify PASSES on the first try: the turn ends immediately', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('All done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 1, 'passing verify means no re-entry');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 3. Enforcing fail-then-pass: failure re-enters the loop, second attempt passes
+// ---------------------------------------------------------------------------
+test('enforcing verify FAILS then PASSES: the agent is returned to the loop and finishes once verified', async () => {
+  const dir = tmpdir();
+  const marker = path.join(dir, 'mark');
+  const prev = process.env.SEMALT_VERIFY_MARKER;
+  process.env.SEMALT_VERIFY_MARKER = marker;
+  // Fail when the marker is absent (creating it), pass once it exists → exactly
+  // fail-then-pass across two verify runs.
+  const verify = {
+    mode: 'enforcing',
+    command: `${NODE} -e "const fs=require('fs');const f=process.env.SEMALT_VERIFY_MARKER;if(fs.existsSync(f)){process.exit(0)}else{fs.writeFileSync(f,'x');process.exit(1)}"`,
+  };
+  const mock = await startMockLLM();
+  mock.replyWith('Done (attempt 1).');
+  mock.replyWith('Fixed it (attempt 2).');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'passed', 'final verify passed');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 2, 'the failing verify re-entered the loop once');
+    // The first (failing) verify pushed a corrective, fenced message into context.
+    const reentry = messages.find((m) => m.role === 'user' && /NOT done/.test(m.content || ''));
+    assert.ok(reentry, 'a corrective re-entry message was injected on failure');
+    assert.match(reentry.content, /UNTRUSTED_EXTERNAL_CONTENT/, 'the failing result is fenced as untrusted');
+    assert.ok(messages.some((m) => m.role === 'assistant' && /attempt 2/.test(m.content)));
+  } finally {
+    await mock.close();
+    if (prev === undefined) delete process.env.SEMALT_VERIFY_MARKER;
+    else process.env.SEMALT_VERIFY_MARKER = prev;
+    fs.rmSync(dir, { recursive: true, force: true });
+  }
+});
+// ---------------------------------------------------------------------------
+// 4. Enforcing exhausts: N failures terminate with verify_failed, NOT the iteration cap
+// ---------------------------------------------------------------------------
+test('enforcing verify that never passes terminates with stopReason verify_failed after max_attempts', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(1)"`, max_attempts: 2 };
+  const mock = await startMockLLM();
+  // Queue more replies than the verify-attempt limit to prove we stop on the
+  // attempt limit, not by exhausting the (much larger) iteration cap.
+  mock.replyWith('Try 1.');
+  mock.replyWith('Try 2.');
+  mock.replyWith('Try 3.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 50, null, { callbacks: cb });
+    assert.strictEqual(res.stopReason, 'verify_failed', 'precise bound, not max_iterations');
+    assert.strictEqual(res.verifyStatus, 'failed');
+    assert.strictEqual(res.metrics.turns.length, 2, 'stopped after exactly max_attempts (2) failed verifies');
+    assert.ok(mock.pending() >= 1, 'the iteration cap was nowhere near — extra replies left unused');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 5. Timeout treated as a failed verify, no hang
+// ---------------------------------------------------------------------------
+test('a hung verify command times out and is treated as a failed verification (no hang)', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "setTimeout(function(){}, 10000)"`, timeout_ms: 300 };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed', 'timeout is a failed verify');
+    assert.strictEqual(res.stopReason, 'end_turn', 'advisory still ends the turn');
+    const fed = lastFedVerify(messages);
+    assert.match(fed.content, /timed out/i);
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 6. Deny-listed verify command is refused
+// ---------------------------------------------------------------------------
+test('a deny-listed verify command is refused (never run) and reported as a failed verify', async () => {
+  const verify = { mode: 'advisory', command: 'rm -rf /' };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'failed');
+    const fed = lastFedVerify(messages);
+    assert.match(fed.content, /deny-list/i, 'the result explains the command was refused');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 7. --no-verify skips it; no command configured is a no-op
+// ---------------------------------------------------------------------------
+test('--no-verify skips an otherwise-failing enforcing verify; the turn ends as skipped', async () => {
+  const verify = { mode: 'enforcing', command: `${NODE} -e "process.exit(1)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb, noVerify: true });
+    assert.strictEqual(res.verifyStatus, 'skipped');
+    assert.strictEqual(res.stopReason, 'end_turn');
+    assert.strictEqual(res.metrics.turns.length, 1, 'no verify, no re-entry');
+    assert.ok(!lastFedVerify(messages), 'no verify result fed into context');
+  } finally {
+    await mock.close();
+  }
+});
+test('no command configured is a no-op (skipped), even in enforcing mode', async () => {
+  const verify = { mode: 'enforcing', command: '' };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const { cb } = collector();
+    const messages = [{ role: 'user', content: 'do it' }];
+    const res = await runner.runAgentLoop(messages, 'test-model', 10, null, { callbacks: cb });
+    assert.strictEqual(res.verifyStatus, 'skipped');
+    assert.strictEqual(res.stopReason, 'end_turn');
+  } finally {
+    await mock.close();
+  }
+});
+// ---------------------------------------------------------------------------
+// 8. Headless surfaces verifyStatus
+// ---------------------------------------------------------------------------
+test('headless json output surfaces verifyStatus', async () => {
+  const verify = { mode: 'advisory', command: `${NODE} -e "process.exit(0)"` };
+  const mock = await startMockLLM();
+  mock.replyWith('Done.');
+  try {
+    const { runner } = buildRunner(mock.base, verify);
+    const lines = [];
+    await runHeadless({
+      runAgentLoop: runner.runAgentLoop,
+      messages: [{ role: 'user', content: 'do it' }],
+      model: 'test-model',
+      mode: 'json',
+      maxIterations: 10,
+      write: (s) => lines.push(s),
+    });
+    const objs = lines.join('').split('\n').filter((l) => l.trim()).map((l) => JSON.parse(l));
+    assert.strictEqual(objs.length, 1);
+    assert.strictEqual(objs[0].verifyStatus, 'passed', 'verifyStatus is in the json envelope');
+    assert.strictEqual(objs[0].stopReason, 'end_turn');
+  } finally {
+    await mock.close();
+  }
+});

package/test/verify.test.js ADDED Viewed

@@ -0,0 +1,141 @@
+'use strict';
+// Unit tests for self-verification (Task 4.2) — the pure normalizer and the
+// command runner in lib/verify.js. The runner uses the REAL spawnSync via
+// portable `node -e` commands so exit-code semantics, deny-list refusal, the
+// no-op cases, and untrusted-fencing are all exercised directly (no agent loop).
+const { test } = require('node:test');
+const assert = require('node:assert');
+const { normalizeVerify, createVerifyRunner: _createVerifyRunner } = require('../lib/verify');
+const { DEFAULT_VERIFY_TIMEOUT_MS, DEFAULT_VERIFY_MAX_ATTEMPTS } = require('../lib/constants');
+const NODE = JSON.stringify(process.execPath);
+// These tests exercise verify ORCHESTRATION (deny-list, exit-code semantics,
+// timeout, no-op/skip, fencing) — NOT the OS sandbox, which has its own
+// dedicated tests (hooks-verify-sandbox.test.js). Inject a pass-through sandbox
+// resolver so the command runs plain via the 2-arg spawn(command, opts) form.
+const NO_SANDBOX = (command) => ({ run: true, useShell: true, file: command, args: [], sandbox: 'off' });
+const createVerifyRunner = (opts = {}) => _createVerifyRunner({ sandbox: NO_SANDBOX, ...opts });
+const runnerFor = (verify) => createVerifyRunner({ getConfig: () => ({ verify }) });
+// ---------------------------------------------------------------------------
+// normalizeVerify
+// ---------------------------------------------------------------------------
+test('normalizeVerify: defaults for empty/garbage input', () => {
+  for (const bad of [undefined, null, 42, 'x', [], true]) {
+    assert.deepStrictEqual(normalizeVerify(bad), {
+      mode: 'advisory',
+      command: '',
+      timeout_ms: DEFAULT_VERIFY_TIMEOUT_MS,
+      expected_exit_code: 0,
+      max_attempts: DEFAULT_VERIFY_MAX_ATTEMPTS,
+    });
+  }
+});
+test('normalizeVerify: accepts valid fields, rejects invalid ones', () => {
+  const v = normalizeVerify({
+    mode: 'enforcing', command: '  npm test  ',
+    timeout_ms: 5000, expected_exit_code: 2, max_attempts: 4,
+  });
+  assert.strictEqual(v.mode, 'enforcing');
+  assert.strictEqual(v.command, 'npm test', 'command is trimmed');
+  assert.strictEqual(v.timeout_ms, 5000);
+  assert.strictEqual(v.expected_exit_code, 2);
+  assert.strictEqual(v.max_attempts, 4);
+  // Invalid values fall back to defaults — never unbounded/negative.
+  const bad = normalizeVerify({
+    mode: 'bogus', command: '   ', timeout_ms: 0,
+    expected_exit_code: -1, max_attempts: 0,
+  });
+  assert.strictEqual(bad.mode, 'advisory', 'unknown mode → advisory');
+  assert.strictEqual(bad.command, '', 'blank command → empty (no-op)');
+  assert.strictEqual(bad.timeout_ms, DEFAULT_VERIFY_TIMEOUT_MS);
+  assert.strictEqual(bad.expected_exit_code, 0, 'negative expected exit code rejected');
+  assert.strictEqual(bad.max_attempts, DEFAULT_VERIFY_MAX_ATTEMPTS, 'zero attempts rejected');
+});
+// ---------------------------------------------------------------------------
+// runner — no-op cases
+// ---------------------------------------------------------------------------
+test('run: no command configured is a no-op (skipped)', async () => {
+  const res = await runnerFor({ command: '' }).run();
+  assert.strictEqual(res.skipped, true);
+  assert.strictEqual(res.ran, false);
+});
+test('run: --no-verify short-circuits even with a command configured', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(0)"` }).run({ noVerify: true });
+  assert.strictEqual(res.skipped, true);
+  assert.strictEqual(res.ran, false);
+});
+// ---------------------------------------------------------------------------
+// runner — exit-code based success (never stdout parsing)
+// ---------------------------------------------------------------------------
+test('run: exit 0 passes by default', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(0)"` }).run();
+  assert.strictEqual(res.passed, true);
+  assert.strictEqual(res.ran, true);
+  assert.strictEqual(res.exitCode, 0);
+  assert.match(res.output, /PASSED/);
+});
+test('run: nonzero exit fails by default', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(1)"` }).run();
+  assert.strictEqual(res.passed, false);
+  assert.strictEqual(res.exitCode, 1);
+  assert.match(res.output, /FAILED/);
+});
+test('run: a command that prints "PASS" but exits nonzero still FAILS (exit-code based, not stdout parsing)', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.stdout.write('ALL TESTS PASS');process.exit(1)"` }).run();
+  assert.strictEqual(res.passed, false, 'stdout success words do not make a failing exit pass');
+});
+test('run: configurable expected_exit_code', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.exit(3)"`, expected_exit_code: 3 }).run();
+  assert.strictEqual(res.passed, true, 'exit matches the expected non-zero code');
+  assert.strictEqual(res.exitCode, 3);
+});
+// ---------------------------------------------------------------------------
+// runner — deny-list, timeout, fencing
+// ---------------------------------------------------------------------------
+test('run: a deny-listed verify command is refused (never run) and reported non-passing', async () => {
+  const res = await runnerFor({ command: 'rm -rf /' }).run();
+  assert.strictEqual(res.passed, false);
+  assert.strictEqual(res.ran, false, 'the command was never executed');
+  assert.ok(res.denied, 'a deny-list label is recorded');
+  assert.match(res.output, /deny-list/i);
+});
+test('run: a hung command times out and is treated as a failed verify (no hang)', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "setTimeout(function(){}, 10000)"`, timeout_ms: 300 }).run();
+  assert.strictEqual(res.timedOut, true);
+  assert.strictEqual(res.passed, false);
+  assert.match(res.output, /timed out/i);
+});
+test('run: output is fenced as untrusted external content', async () => {
+  const res = await runnerFor({ command: `${NODE} -e "process.stdout.write('SENTINEL_OUT_9');process.exit(1)"` }).run();
+  assert.match(res.fenced, /UNTRUSTED_EXTERNAL_CONTENT/, 'fenced with the standard delimiter');
+  assert.match(res.fenced, /SENTINEL_OUT_9/, 'the command output is inside the fence');
+});
+test('run: deny-list and timeout both short-circuit via an injected spawn that is never called', async () => {
+  let spawnCalls = 0;
+  const spy = () => { spawnCalls++; return { status: 0, stdout: '', stderr: '' }; };
+  const runner = createVerifyRunner({ getConfig: () => ({ verify: { command: 'rm -rf /' } }), spawn: spy });
+  const res = await runner.run();
+  assert.strictEqual(spawnCalls, 0, 'a deny-listed command never reaches spawn');
+  assert.ok(res.denied);
+});