@semalt-ai/code 1.8.5 → 1.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (192) hide show
  1. package/.claude/settings.local.json +7 -1
  2. package/.github/workflows/ci.yml +69 -0
  3. package/ARCHITECTURE.md +6 -95
  4. package/CLAUDE.md +196 -316
  5. package/README.md +148 -4
  6. package/docs/ARCHITECTURE.md +1321 -0
  7. package/docs/CONFIG.md +340 -0
  8. package/docs/HISTORY.md +245 -0
  9. package/examples/embed.js +74 -0
  10. package/index.js +251 -10
  11. package/lib/agent.js +856 -120
  12. package/lib/api.js +239 -50
  13. package/lib/args.js +74 -2
  14. package/lib/audit.js +23 -1
  15. package/lib/background.js +584 -0
  16. package/lib/checkpoints.js +757 -0
  17. package/lib/commands/auth.js +94 -0
  18. package/lib/commands/chat-session.js +489 -0
  19. package/lib/commands/chat-slash.js +415 -0
  20. package/lib/commands/chat-turn.js +669 -0
  21. package/lib/commands/chat.js +407 -0
  22. package/lib/commands/custom.js +157 -0
  23. package/lib/commands/history-utils.js +66 -0
  24. package/lib/commands/index.js +268 -0
  25. package/lib/commands/mcp.js +113 -0
  26. package/lib/commands/oneshot.js +193 -0
  27. package/lib/commands/registry.js +269 -0
  28. package/lib/commands/tasks.js +89 -0
  29. package/lib/compact.js +87 -0
  30. package/lib/config.js +360 -11
  31. package/lib/constants.js +401 -3
  32. package/lib/deny.js +199 -0
  33. package/lib/doctor.js +160 -0
  34. package/lib/headless.js +202 -0
  35. package/lib/hooks.js +286 -0
  36. package/lib/images.js +270 -0
  37. package/lib/internals.js +49 -0
  38. package/lib/mcp/boundary.js +131 -0
  39. package/lib/mcp/client.js +270 -0
  40. package/lib/mcp/oauth.js +134 -0
  41. package/lib/memory.js +209 -0
  42. package/lib/metrics.js +37 -2
  43. package/lib/payload.js +54 -0
  44. package/lib/permission-rules.js +401 -0
  45. package/lib/permissions.js +123 -26
  46. package/lib/pricing.js +67 -0
  47. package/lib/proc.js +62 -0
  48. package/lib/prompts.js +99 -8
  49. package/lib/sandbox.js +568 -0
  50. package/lib/sdk.js +328 -0
  51. package/lib/secrets.js +211 -0
  52. package/lib/skills.js +223 -0
  53. package/lib/subagents.js +516 -0
  54. package/lib/tool_registry.js +2862 -0
  55. package/lib/tool_specs.js +263 -9
  56. package/lib/tools.js +352 -1039
  57. package/lib/ui/anim.js +86 -0
  58. package/lib/ui/ansi.js +17 -27
  59. package/lib/ui/chat-history.js +253 -71
  60. package/lib/ui/create-ui.js +67 -24
  61. package/lib/ui/diff.js +90 -25
  62. package/lib/ui/file-activity.js +236 -0
  63. package/lib/ui/format.js +195 -29
  64. package/lib/ui/input-field.js +21 -11
  65. package/lib/ui/md-stream.js +234 -0
  66. package/lib/ui/render-operation.js +113 -0
  67. package/lib/ui/select.js +1 -4
  68. package/lib/ui/status-bar.js +146 -36
  69. package/lib/ui/stream.js +20 -13
  70. package/lib/ui/theme.js +190 -44
  71. package/lib/ui/tool-operation.js +190 -0
  72. package/lib/ui/utils.js +9 -5
  73. package/lib/ui/web-activity.js +270 -0
  74. package/lib/ui/writer.js +159 -45
  75. package/lib/ui.js +1 -1
  76. package/lib/verify.js +229 -0
  77. package/lib/web-extract.js +213 -0
  78. package/lib/web-summarize.js +68 -0
  79. package/package.json +19 -4
  80. package/scripts/lint.js +57 -0
  81. package/test/agent-loop.test.js +389 -0
  82. package/test/anim-driver.test.js +153 -0
  83. package/test/ask-user-display.test.js +226 -0
  84. package/test/ask-user-gate.test.js +231 -0
  85. package/test/background.test.js +414 -0
  86. package/test/chat-history-nocolor.test.js +155 -0
  87. package/test/chat-relogin.test.js +207 -0
  88. package/test/chat.test.js +114 -0
  89. package/test/checkpoints-agent.test.js +181 -0
  90. package/test/checkpoints.test.js +650 -0
  91. package/test/command-registry.test.js +160 -0
  92. package/test/compact.test.js +116 -0
  93. package/test/completion-lazy.test.js +52 -0
  94. package/test/config-merge.test.js +324 -0
  95. package/test/config-quarantine.test.js +128 -0
  96. package/test/config-write-guard-allow-anywhere.test.js +56 -0
  97. package/test/config-write-guard-skip.test.js +46 -0
  98. package/test/config-write-guard.test.js +153 -0
  99. package/test/context-split.test.js +215 -0
  100. package/test/cost-doctor.test.js +142 -0
  101. package/test/custom-commands-chat.test.js +106 -0
  102. package/test/custom-commands.test.js +230 -0
  103. package/test/defer-detail-band.test.js +403 -0
  104. package/test/deny-windows.test.js +120 -0
  105. package/test/deny.test.js +83 -0
  106. package/test/detail-band-tab-flatten.test.js +242 -0
  107. package/test/download-allow-anywhere.test.js +66 -0
  108. package/test/download-confine.test.js +153 -0
  109. package/test/exec-diff.test.js +268 -0
  110. package/test/executors.test.js +599 -0
  111. package/test/extract-tool-calls.test.js +349 -0
  112. package/test/fetch-url-validation.test.js +219 -0
  113. package/test/file-activity.test.js +522 -0
  114. package/test/fixtures/tool-calls.js +57 -0
  115. package/test/fixtures/web-page.js +91 -0
  116. package/test/git-tools.test.js +384 -0
  117. package/test/grep-glob-serialize.test.js +242 -0
  118. package/test/grep-glob.test.js +268 -0
  119. package/test/grep-path-target.test.js +227 -0
  120. package/test/harness/README.md +57 -0
  121. package/test/harness/chat-harness.js +143 -0
  122. package/test/harness/memwarn-headless-child.js +65 -0
  123. package/test/harness/mock-llm.js +120 -0
  124. package/test/harness/mock-mcp-server.js +142 -0
  125. package/test/harness/sse-server.js +69 -0
  126. package/test/headless.test.js +348 -0
  127. package/test/history-utils.test.js +88 -0
  128. package/test/hooks-agent.test.js +238 -0
  129. package/test/hooks-verify-sandbox.test.js +232 -0
  130. package/test/hooks.test.js +216 -0
  131. package/test/http-get-user-agent.test.js +142 -0
  132. package/test/images-api.test.js +208 -0
  133. package/test/images.test.js +238 -0
  134. package/test/input-field-ctrl-o.test.js +37 -0
  135. package/test/live-height-physical.test.js +281 -0
  136. package/test/max-iterations.test.js +218 -0
  137. package/test/mcp-boundary.test.js +57 -0
  138. package/test/mcp-client.test.js +267 -0
  139. package/test/mcp-oauth.test.js +86 -0
  140. package/test/md-stream.test.js +183 -0
  141. package/test/memory-truncation-warning.test.js +222 -0
  142. package/test/memory.test.js +198 -0
  143. package/test/native-dispatch.test.js +409 -0
  144. package/test/native-live-narration.test.js +254 -0
  145. package/test/output-chokepoint.test.js +188 -0
  146. package/test/output-heredoc-leak.test.js +195 -0
  147. package/test/output-preview.test.js +245 -0
  148. package/test/path-guards.test.js +134 -0
  149. package/test/payload.test.js +99 -0
  150. package/test/permission-rules-agent.test.js +210 -0
  151. package/test/permission-rules.test.js +297 -0
  152. package/test/permissions.test.js +362 -0
  153. package/test/plan-mode.test.js +167 -0
  154. package/test/read-paginate.test.js +275 -0
  155. package/test/readonly-tools.test.js +177 -0
  156. package/test/render-operation.test.js +317 -0
  157. package/test/replay-descriptor-xml.test.js +216 -0
  158. package/test/replay-descriptor.test.js +189 -0
  159. package/test/replay-web-aggregate.test.js +291 -0
  160. package/test/replay-web-persist.test.js +241 -0
  161. package/test/result-cap.test.js +233 -0
  162. package/test/running-glyph-anim.test.js +111 -0
  163. package/test/sandbox-agent.test.js +147 -0
  164. package/test/sandbox-integration.test.js +216 -0
  165. package/test/sandbox.test.js +408 -0
  166. package/test/sdk.test.js +234 -0
  167. package/test/shell-output-cap.test.js +181 -0
  168. package/test/skills-chat.test.js +110 -0
  169. package/test/skills.test.js +295 -0
  170. package/test/smoke.test.js +68 -0
  171. package/test/status-bar-driver.test.js +93 -0
  172. package/test/status-bar-pause.test.js +164 -0
  173. package/test/status-bar-resync.test.js +188 -0
  174. package/test/stream-parser.test.js +171 -0
  175. package/test/subagents-agent.test.js +178 -0
  176. package/test/subagents.test.js +222 -0
  177. package/test/theme-palette.test.js +166 -0
  178. package/test/tool-registry.test.js +85 -0
  179. package/test/trim-budget.test.js +101 -0
  180. package/test/truncate-visible.test.js +78 -0
  181. package/test/verify-agent.test.js +317 -0
  182. package/test/verify.test.js +141 -0
  183. package/test/view-image.test.js +199 -0
  184. package/test/web-activity-ordering.test.js +203 -0
  185. package/test/web-activity.test.js +207 -0
  186. package/test/web-data-extraction-guidance.test.js +71 -0
  187. package/test/web-extract.test.js +185 -0
  188. package/test/web-fetch-agent.test.js +291 -0
  189. package/test/web-fetch-mode.test.js +193 -0
  190. package/test/web-search.test.js +380 -0
  191. package/lib/commands.js +0 -1438
  192. package/path +0 -1
@@ -0,0 +1,142 @@
1
+ #!/usr/bin/env node
2
+ 'use strict';
3
+
4
+ // Mock MCP server over stdio (Task 3.3).
5
+ // ----------------------------------------------------------------------------
6
+ // A tiny, dependency-free, spec-correct MCP server the tests spawn as a local
7
+ // subprocess (no network). It speaks newline-delimited JSON-RPC 2.0 on
8
+ // stdin/stdout — exactly the framing `StdioClientTransport` reads — so the REAL
9
+ // MCP SDK client connects to it and the whole discovery/dispatch path is
10
+ // exercised against a deterministic peer.
11
+ //
12
+ // Implemented methods: `initialize`, `notifications/initialized` (ignored),
13
+ // `tools/list`, `tools/call`, `ping`. It deliberately does NOT depend on the
14
+ // SDK server classes (which require zod schemas) — the raw protocol is ~60
15
+ // lines and keeps the test self-contained and reproducible.
16
+ //
17
+ // Behavior knobs (env vars, so a test can script edge cases):
18
+ // MOCK_MCP_NAME server name advertised in initialize (default "mock")
19
+ // MOCK_MCP_EXIT_ON_START if set, exit(1) immediately — simulates a server
20
+ // that dies on launch (graceful-degradation test).
21
+ //
22
+ // Tools exposed:
23
+ // echo { text } → returns the text back (untrusted-content payloads
24
+ // can be injected here to test the delimiter).
25
+ // add { a, b } → returns a+b.
26
+ // boom {} → returns an MCP tool-level error (isError: true).
27
+
// Server name reported in the `initialize` reply; MOCK_MCP_NAME overrides the
// default of "mock".
const SERVER_NAME = process.env.MOCK_MCP_NAME || 'mock';

// Tool catalogue returned by `tools/list`. Each row is
// [name, description, schema properties, required property names]; a row with
// no `required` entry produces a schema without a `required` key.
const TOOLS = [
  [
    'echo',
    'Echo the provided text back verbatim.',
    { text: { type: 'string', description: 'Text to echo' } },
    ['text'],
  ],
  [
    'add',
    'Add two numbers and return the sum.',
    {
      a: { type: 'number', description: 'First addend' },
      b: { type: 'number', description: 'Second addend' },
    },
    ['a', 'b'],
  ],
  [
    'boom',
    'Always reports a tool-level error.',
    {},
  ],
].map(([name, description, properties, required]) => ({
  name,
  description,
  inputSchema: required
    ? { type: 'object', properties, required }
    : { type: 'object', properties },
}));
58
+
// Write one JSON-RPC message to stdout as a single newline-terminated line.
function send(msg) {
  const line = `${JSON.stringify(msg)}\n`;
  process.stdout.write(line);
}

// Send a successful JSON-RPC response carrying `result` for request `id`.
function ok(id, result) {
  const reply = { jsonrpc: '2.0', id, result };
  send(reply);
}

// Send a JSON-RPC error response ({ code, message }) for request `id`.
function err(id, code, message) {
  const error = { code, message };
  send({ jsonrpc: '2.0', id, error });
}
70
+
// Run one of the mock tools and build its `tools/call` result object.
//   echo → the `text` argument as a string ('' when absent)
//   add  → String(a + b), treating a missing/falsy operand as 0
//   boom → a fixed text payload flagged with isError: true
// Returns null when `name` is not one of the tools above.
function callTool(name, args) {
  const textResult = (text) => ({ content: [{ type: 'text', text }] });

  switch (name) {
    case 'echo':
      return textResult(String((args && args.text) ?? ''));
    case 'add': {
      const left = Number((args && args.a) || 0);
      const right = Number((args && args.b) || 0);
      return textResult(String(left + right));
    }
    case 'boom':
      return { ...textResult('the boom tool failed as designed'), isError: true };
    default:
      return null; // unknown tool
  }
}
84
+
// Dispatch one parsed JSON-RPC message to the matching MCP method handler.
// `notifications/initialized` is accepted silently; any other unrecognised
// method gets a -32601 error, but only when the message carries an id.
function handle(msg) {
  const { id, method, params } = msg;

  switch (method) {
    case 'initialize':
      // Echo the client's requested protocol version — by definition one the
      // client supports — and advertise the tools capability.
      ok(id, {
        protocolVersion: (params && params.protocolVersion) || '2025-06-18',
        capabilities: { tools: {} },
        serverInfo: { name: SERVER_NAME, version: '1.0.0' },
      });
      return;

    case 'notifications/initialized':
      return; // notification, no reply

    case 'ping':
      ok(id, {});
      return;

    case 'tools/list':
      ok(id, { tools: TOOLS });
      return;

    case 'tools/call': {
      const toolName = params && params.name;
      const outcome = callTool(toolName, params && params.arguments);
      if (outcome) ok(id, outcome);
      else err(id, -32602, `Unknown tool: ${toolName}`);
      return;
    }

    default:
      if (id !== undefined) err(id, -32601, `Method not found: ${method}`);
  }
}
109
+
// Start the stdio server loop: read newline-delimited JSON from stdin, hand
// each parsed message to `handle`, and exit with code 0 when stdin ends.
function run() {
  // Behavior knob: simulate a server that dies on launch (degradation test).
  if (process.env.MOCK_MCP_EXIT_ON_START) {
    process.stderr.write('mock-mcp-server: simulated startup failure\n');
    process.exit(1);
  }

  // Text received so far that has not yet been terminated by a newline.
  let pending = '';

  const processLine = (rawLine) => {
    const line = rawLine.trim();
    if (!line) return;

    let msg;
    try {
      msg = JSON.parse(line);
    } catch {
      return; // unparseable lines are dropped without a reply
    }

    try {
      handle(msg);
    } catch (e) {
      // Report handler failures as -32603, but only for messages with an id.
      if (msg && msg.id !== undefined) err(msg.id, -32603, e.message);
    }
  };

  process.stdin.setEncoding('utf8');
  process.stdin.on('data', (chunk) => {
    pending += chunk;
    const pieces = pending.split('\n');
    // The last piece is whatever follows the final newline; keep it buffered.
    pending = pieces.pop();
    for (const piece of pieces) processLine(piece);
  });
  process.stdin.on('end', () => process.exit(0));
}
134
+
135
+ // Only attach the stdin server loop when spawned directly (as the MCP client
136
+ // does). The Node test runner discovers and EXECUTES every file under test/ —
137
+ // including this one — each in a child process where `require.main === module`
138
+ // is also true. It sets NODE_TEST_CONTEXT in that child, so we use its ABSENCE
139
+ // (plus require.main) to mean "spawned as a real MCP server"; otherwise this
140
+ // stdin loop would hang the test runner forever waiting on input.
141
+ if (require.main === module && !process.env.NODE_TEST_CONTEXT) run();
142
+
@@ -0,0 +1,69 @@
1
+ 'use strict';
2
+
3
+ // Minimal scriptable SSE server for streaming-parser tests (Task 1.1).
4
+ // Task 1.2 extends this into the full mock-LLM harness (queues, status codes,
5
+ // Retry-After, delays); for now it serves one scripted response per request.
6
+ //
7
+ // Usage:
8
+ // const srv = await startSseServer({ chunks: ['data: {...}\n', 'data: [DONE]\n'] });
9
+ // // point config.api_base at srv.base, make the request, then:
10
+ // await srv.close();
11
+
12
+ const http = require('http');
13
+
// Start a scripted SSE server on an ephemeral 127.0.0.1 port.
//
// opts:
//   chunks   string[] — written sequentially (with a tiny gap) so the client's
//            cross-chunk line buffering is genuinely exercised. A single string
//            is also accepted and sent as one chunk.
//   body     single-chunk response, used only when `chunks` is neither an
//            array nor a string (an empty chunk is sent when both are absent).
//   status   HTTP status code (default 200).
//   headers  extra response headers (merged over the SSE defaults).
//   gapMs    delay between chunks (default 4ms).
//
// Resolves with { server, port, base, close } once the server is listening;
// `close()` returns a promise that settles when the server has closed.
function startSseServer(opts = {}) {
  const status = opts.status || 200;
  const gapMs = opts.gapMs == null ? 4 : opts.gapMs;

  // Honour the documented "single string" form of `chunks`; previously a
  // string was ignored and `body` (or an empty chunk) was sent instead.
  let chunks;
  if (Array.isArray(opts.chunks)) chunks = opts.chunks;
  else if (typeof opts.chunks === 'string') chunks = [opts.chunks];
  else chunks = [opts.body != null ? opts.body : ''];

  const server = http.createServer((req, res) => {
    // Drain the request body before responding.
    req.resume();
    req.on('end', () => {
      res.writeHead(status, {
        'Content-Type': 'text/event-stream',
        'Cache-Control': 'no-cache',
        Connection: 'keep-alive',
        ...(opts.headers || {}),
      });
      let i = 0;
      const writeNext = () => {
        // The client hung up mid-stream: stop scheduling further writes.
        if (res.destroyed) return;
        if (i >= chunks.length) { res.end(); return; }
        res.write(chunks[i++]);
        if (gapMs > 0) setTimeout(writeNext, gapMs);
        else writeNext();
      };
      writeNext();
    });
  });

  return new Promise((resolve) => {
    server.listen(0, '127.0.0.1', () => {
      const { port } = server.address();
      resolve({
        server,
        port,
        base: `http://127.0.0.1:${port}`,
        close: () => new Promise((r) => server.close(r)),
      });
    });
  });
}
61
+
// Format `obj` as one SSE `data:` line: the JSON-serialized payload followed
// by a newline.
function sse(obj) {
  const payload = JSON.stringify(obj);
  return 'data: ' + payload + '\n';
}

// Ready-made `data: [DONE]` line for scripting the end of a stream.
const DONE = 'data: [DONE]\n';
68
+
69
+ module.exports = { startSseServer, sse, DONE };
@@ -0,0 +1,348 @@
1
+ 'use strict';
2
+
3
+ // Headless output tests (Task 2.4). Drives the REAL runAgentLoop against the
4
+ // mock LLM through runHeadless in each output mode, capturing stdout. Proves:
5
+ // * json → a single JSON object { result, toolCalls, usage, cost }
6
+ // * stream-json → newline-delimited JSON events (assistant / tool / result)
7
+ // * machine modes are byte-pure: no ANSI escapes leak, even though a tool runs
8
+ // (write_file's green ✓ and the permission diff would otherwise print).
9
+
10
+ const { test, before, after } = require('node:test');
11
+ const assert = require('node:assert');
12
+ const os = require('node:os');
13
+ const fs = require('node:fs');
14
+ const path = require('node:path');
15
+
16
+ const ui = require('../lib/ui');
17
+ const { createApiClient } = require('../lib/api');
18
+ const { createToolExecutor, extractToolCalls } = require('../lib/tools');
19
+ const { createPermissionManager } = require('../lib/permissions');
20
+ const { createAgentRunner } = require('../lib/agent');
21
+ const { startMockLLM } = require('./harness/mock-llm');
22
+ const { runHeadless, usageFromMetrics, finalResult, isMachineMode, createHeadlessSink } = require('../lib/headless');
23
+
// State captured in `before` so `after` can restore the process exactly.
let prevKey;
let CWD;
let PREV_CWD;

// Run the suite from a throwaway working directory with a dummy API key, so
// files written by the tools under test land in the scratch directory.
before(() => {
  prevKey = process.env.SEMALT_API_KEY;
  process.env.SEMALT_API_KEY = 'test-key';
  PREV_CWD = process.cwd();
  // realpath so the path matches what process.cwd() reports after chdir.
  CWD = fs.realpathSync(fs.mkdtempSync(path.join(os.tmpdir(), 'semalt-headless-')));
  process.chdir(CWD);
});

// Restore cwd and the API key, then delete the scratch directory so repeated
// runs do not accumulate `semalt-headless-*` folders in the temp dir.
after(() => {
  // Guarded: if `before` failed early there is nothing to restore or remove.
  if (PREV_CWD) process.chdir(PREV_CWD);
  if (prevKey === undefined) delete process.env.SEMALT_API_KEY; else process.env.SEMALT_API_KEY = prevKey;
  // Removal happens after leaving the directory (a cwd cannot be removed on
  // some platforms); `force` makes an already-missing directory a no-op.
  if (CWD) fs.rmSync(CWD, { recursive: true, force: true });
});
37
+
// Assemble the real API client, permission manager, tool executor and agent
// runner, with the API client pointed at `base` (the mock LLM's base URL).
// Permissions are created with skipPermissions: true and no-op UI callbacks.
function buildRunner(base) {
  const config = {
    api_base: base,
    api_key: 'test-key',
    default_model: 'test-model',
    temperature: 0.5,
    request_timeout_ms: 5000,
    stream: true,
    models: [],
  };
  const getConfig = () => config;
  const saveConfig = () => {};

  const api = createApiClient({ getConfig, saveConfig, ui });

  const permissionManager = createPermissionManager(ui, { skipPermissions: true });
  permissionManager.setUICallbacks({
    onAddMessage: () => {},
    onShowModal: () => {},
    onCloseModal: () => {},
    onCaptureNavigation: () => () => {},
  });

  const executor = createToolExecutor(permissionManager, ui, getConfig);

  return createAgentRunner({
    chatStream: api.chatStream,
    extractToolCalls,
    agentExecShell: executor.agentExecShell,
    agentExecFile: executor.agentExecFile,
    describePermission: executor.describePermission,
    permissionManager,
    ui,
    getConfig,
  });
}
54
+
// Run `runHeadless` in the given output `mode` and collect everything it
// writes through the injectable `write` sink — no global stdout swap (which
// would collide with the node:test TAP reporter). Chrome is suppressed inside
// runHeadless via setUIActive, so the sink receives ONLY the structured output.
// Resolves with { out, res }: the concatenated output and runHeadless's result.
async function runCapture(mode, runner, messages) {
  const captured = [];
  // Non-string chunks are decoded as UTF-8 before being stored.
  const write = (s) => {
    captured.push(typeof s === 'string' ? s : s.toString('utf8'));
    return true;
  };

  const res = await runHeadless({
    runAgentLoop: runner.runAgentLoop,
    messages,
    model: 'test-model',
    mode,
    maxIterations: 10,
    agentOpts: { systemPromptMode: 'system_role' },
    write,
  });

  return { out: captured.join(''), res };
}
69
+
// True when `s` contains an ESC character immediately followed by `[`.
const HAS_ANSI = (s) => `${s}`.includes('\u001b[');
71
+
72
+ // ---------------------------------------------------------------------------
73
+ // Pure helpers
74
+ // ---------------------------------------------------------------------------
75
+
test('usageFromMetrics aggregates turns', () => {
  const turns = [
    { promptTokens: 10, completionTokens: 4 },
    { promptTokens: 20, completionTokens: 6 },
  ];
  const expected = {
    prompt_tokens: 30,
    completion_tokens: 10,
    total_tokens: 40,
    context_tokens: 20,
    context_base_est: 0,
    context_working_est: 0,
    turns: 2,
  };
  assert.deepStrictEqual(usageFromMetrics({ turns }), expected);
});

test('finalResult returns the last assistant message', () => {
  const conversation = [
    { role: 'user', content: 'hi' },
    { role: 'assistant', content: 'first' },
    { role: 'user', content: 'tool results' },
    { role: 'assistant', content: 'final answer' },
  ];
  const picked = finalResult(conversation, []);
  assert.strictEqual(picked, 'final answer');
});

test('isMachineMode classifies the formats', () => {
  // [format, expected classification]
  const cases = [
    ['json', true],
    ['stream-json', true],
    ['text', false],
  ];
  for (const [format, expected] of cases) {
    assert.strictEqual(isMachineMode(format), expected);
  }
});
102
+
103
+ // ---------------------------------------------------------------------------
104
+ // json mode
105
+ // ---------------------------------------------------------------------------
106
+
test('json mode prints exactly one JSON object with the documented shape', async () => {
  const mock = await startMockLLM();
  // First reply triggers a write_file tool call; the second is the final answer.
  mock.replyWith('<write_file path="hl.txt">hi</write_file>', { usage: { prompt_tokens: 30, completion_tokens: 10 } });
  mock.replyWith('Wrote it.', { usage: { prompt_tokens: 12, completion_tokens: 7 } });
  try {
    const prompt = [{ role: 'user', content: 'write a file' }];
    const { out } = await runCapture('json', buildRunner(mock.base), prompt);

    assert.ok(!HAS_ANSI(out), 'no ANSI escapes leak into json stdout');
    const nonBlank = out.split('\n').filter((l) => l.trim());
    assert.strictEqual(nonBlank.length, 1, 'exactly one line of output');

    const payload = JSON.parse(nonBlank[0]);
    const topLevelKeys = Object.keys(payload).sort();
    assert.deepStrictEqual(topLevelKeys, ['cost', 'result', 'stopReason', 'toolCalls', 'usage', 'verifyStatus']);
    assert.strictEqual(payload.result, 'Wrote it.');
    assert.strictEqual(payload.stopReason, 'end_turn', 'natural completion reports end_turn');
    assert.strictEqual(payload.verifyStatus, 'skipped', 'no verify command configured → skipped');
    assert.strictEqual(payload.cost, null);
    assert.strictEqual(payload.usage.total_tokens, 59); // 30+10 (tool turn) + 12+7 (final)
    assert.strictEqual(payload.usage.turns, 2);

    assert.strictEqual(payload.toolCalls.length, 1, 'the write_file tool call is recorded');
    const [call] = payload.toolCalls;
    assert.strictEqual(call.tool, 'write');
    assert.deepStrictEqual(call.args, ['hl.txt', 'hi']);
    assert.strictEqual(call.ok, true);

    const written = fs.readFileSync(path.join(CWD, 'hl.txt'), 'utf8');
    assert.strictEqual(written, 'hi', 'tool actually executed');
  } finally {
    await mock.close();
  }
});
135
+
test('json mode computes cost from the price table when the model is priced', async () => {
  const mock = await startMockLLM();
  mock.replyWith('Answer.', { usage: { prompt_tokens: 1_000_000, completion_tokens: 1_000_000 } });
  try {
    const { runAgentLoop } = buildRunner(mock.base);
    const pieces = [];
    const write = (s) => {
      pieces.push(s);
      return true;
    };

    // Calls runHeadless directly (not via runCapture) to pass priceOverrides.
    await runHeadless({
      runAgentLoop,
      messages: [{ role: 'user', content: 'q' }],
      model: 'test-model',
      mode: 'json',
      maxIterations: 10,
      agentOpts: { systemPromptMode: 'system_role' },
      priceOverrides: { 'test-model': { input: 2, output: 8 } }, // per Mtok
      write,
    });

    const payload = JSON.parse(pieces.join('').trim());
    assert.strictEqual(payload.cost, 10, '1M in × $2 + 1M out × $8 = $10');
  } finally {
    await mock.close();
  }
});
155
+
156
+ // ---------------------------------------------------------------------------
157
+ // stream-json mode
158
+ // ---------------------------------------------------------------------------
159
+
test('stream-json mode emits valid NDJSON events (assistant / tool / result)', async () => {
  const mock = await startMockLLM();
  mock.replyWith('<write_file path="s.txt">x</write_file>', { usage: { prompt_tokens: 3, completion_tokens: 1 } });
  mock.replyWith('Done streaming.', { usage: { prompt_tokens: 5, completion_tokens: 2 } });
  try {
    const { out } = await runCapture('stream-json', buildRunner(mock.base), [{ role: 'user', content: 'go' }]);

    assert.ok(!HAS_ANSI(out), 'no ANSI escapes leak into stream-json stdout');

    // Every non-blank line must parse as JSON on its own.
    const events = [];
    for (const line of out.split('\n')) {
      if (line.trim()) events.push(JSON.parse(line));
    }
    const lastEvent = events[events.length - 1];

    assert.ok(events.some((e) => e.type === 'tool'), 'a tool event was emitted');
    assert.ok(events.some((e) => e.type === 'result'), 'a terminal result event was emitted');
    assert.strictEqual(lastEvent.type, 'result', 'result is last');

    const firstTool = events.find((e) => e.type === 'tool');
    assert.strictEqual(firstTool.tool, 'write');
    assert.deepStrictEqual(firstTool.args, ['s.txt', 'x']);

    assert.strictEqual(lastEvent.result, 'Done streaming.');
    assert.strictEqual(lastEvent.cost, null);
    assert.strictEqual(lastEvent.usage.total_tokens, 11); // 3+1 + 5+2
  } finally {
    await mock.close();
  }
});
187
+
188
+ // ---------------------------------------------------------------------------
189
+ // text mode does not emit machine output
190
+ // ---------------------------------------------------------------------------
191
+
test('text mode returns the loop result and emits no JSON envelope', async () => {
  const mock = await startMockLLM();
  mock.replyWith('Just a plain answer.');
  try {
    const captured = await runCapture('text', buildRunner(mock.base), [{ role: 'user', content: 'hello' }]);
    const loopResult = captured.res;

    assert.ok(loopResult && Array.isArray(loopResult.messages), 'runHeadless returns the loop result');

    // Nothing resembling a `"result":` JSON key may reach the sink in text mode.
    const envelopePattern = /"result"\s*:/;
    assert.ok(!envelopePattern.test(captured.out), 'text mode does not print a json result envelope');
  } finally {
    await mock.close();
  }
});
204
+
205
+ // ---------------------------------------------------------------------------
206
+ // Phase 6d-ii — descriptor-derived ADDITIVE enrichment of per-tool recs
207
+ // ---------------------------------------------------------------------------
208
+
// The legacy fields every per-tool rec MUST carry byte-identically (names,
// types, values) — the public contract pin guarded by the characterization test.
const LEGACY_KEYS = ['tool', 'args', 'ok', 'ms'];
// The descriptor-derived fields 6d-ii adds. Present additively; never replace a
// legacy field. (`tag`/`noDuration`/`error` also ride along from the core.)
const ENRICHED_KEYS = ['status', 'category', 'durationMs', 'detail', 'meta', 'target', 'attrs'];

// THE GUARD (§4.1) — characterizes a representative run end-to-end: legacy fields
// stay byte-identical in the stream {type:'tool'} event AND the toolCalls[] entry;
// the additive enriched fields are present with correct values; and the FINALIZE
// top-level key-set is byte-identical (no new top-level finalize key).
test('per-tool recs are additively enriched; legacy + finalize key-sets unchanged', async () => {
  const mock = await startMockLLM();
  // Two runs below (json + stream-json); the FIFO queue needs a pair each.
  mock.replyWith('<write_file path="enrich.txt">hi</write_file>', { usage: { prompt_tokens: 3, completion_tokens: 1 } });
  mock.replyWith('Done.', { usage: { prompt_tokens: 2, completion_tokens: 1 } });
  mock.replyWith('<write_file path="enrich.txt">hi</write_file>', { usage: { prompt_tokens: 3, completion_tokens: 1 } });
  mock.replyWith('Done.', { usage: { prompt_tokens: 2, completion_tokens: 1 } });
  try {
    const runner = buildRunner(mock.base);
    // json mode → finalize object with toolCalls[]; stream-json → tool events.
    const { out: jsonOut } = await runCapture('json', runner, [{ role: 'user', content: 'go' }]);
    const finalObj = JSON.parse(jsonOut.split('\n').filter((l) => l.trim())[0]);

    // Finalize top-level key-set BYTE-IDENTICAL (no new top-level key from 6d-ii).
    assert.deepStrictEqual(
      Object.keys(finalObj).sort(),
      ['cost', 'result', 'stopReason', 'toolCalls', 'usage', 'verifyStatus'],
    );

    const arrEntry = finalObj.toolCalls[0];
    // Every legacy field is present in the toolCalls[] entry...
    for (const k of LEGACY_KEYS) assert.ok(k in arrEntry, `toolCalls entry keeps legacy ${k}`);
    // ...with byte-identical values/types.
    assert.strictEqual(arrEntry.tool, 'write');
    assert.deepStrictEqual(arrEntry.args, ['enrich.txt', 'hi']);
    assert.strictEqual(arrEntry.ok, true);
    assert.strictEqual(typeof arrEntry.ms, 'number');
    // Enriched fields present with correct values.
    for (const k of ENRICHED_KEYS) assert.ok(k in arrEntry, `toolCalls entry has additive ${k}`);
    assert.strictEqual(arrEntry.status, 'ok');
    assert.strictEqual(arrEntry.category, 'file');
    assert.strictEqual(arrEntry.target, 'enrich.txt');
    assert.strictEqual(arrEntry.durationMs, arrEntry.ms, 'durationMs mirrors ms');
    assert.strictEqual(arrEntry.detail.kind, 'diff', 'write carries a structured diff detail');
    assert.strictEqual(arrEntry.detail.payload.after, 'hi');
    // toolCalls[] entry must NOT carry the stream-only `type` tag.
    assert.ok(!('type' in arrEntry), 'array entry has no type tag');

    const { out: streamOut } = await runCapture('stream-json', runner, [{ role: 'user', content: 'go' }]);
    const events = streamOut.split('\n').filter((l) => l.trim()).map((l) => JSON.parse(l));
    const toolEvent = events.find((e) => e.type === 'tool');
    // Same enriched rec in the stream event (plus type:'tool').
    assert.strictEqual(toolEvent.type, 'tool');
    // Legacy fields are pinned on the stream event too, including `ms`, which
    // was previously left unchecked here.
    for (const k of LEGACY_KEYS) assert.ok(k in toolEvent, `stream tool event keeps legacy ${k}`);
    assert.strictEqual(toolEvent.tool, 'write');
    assert.deepStrictEqual(toolEvent.args, ['enrich.txt', 'hi']);
    assert.strictEqual(toolEvent.ok, true);
    assert.strictEqual(typeof toolEvent.ms, 'number');
    for (const k of ENRICHED_KEYS) assert.ok(k in toolEvent, `stream tool event has additive ${k}`);
    assert.strictEqual(toolEvent.category, 'file');
    assert.strictEqual(toolEvent.detail.kind, 'diff');
  } finally {
    await mock.close();
  }
});
271
+
// §4.2 — enrichment correctness for an ERROR case: status reflects the error,
// durationMs mirrors ms, category is correct, no detail on errors. Driven through
// the sink directly so the meta shape (incl. error) is exact and deterministic.
test('error op enriches with status:error and no detail; legacy ok:false preserved', () => {
  const emitted = [];
  const sink = createHeadlessSink('stream-json', (o) => emitted.push(o));

  // shell op with non-zero exit → meta.error present (mirrors agent.js).
  const opMeta = {
    id: 1,
    call: ['exec', 'false'],
    attrs: { command: 'false' },
    meta: { exit_code: 3 },
    error: { message: 'exit 3', code: 3 },
  };
  sink.callbacks.onToolEnd('exec', 'Command `false`:\nExit code: 3\n', 7, opMeta);

  const streamEvent = emitted.find((e) => e.type === 'tool');
  const [arrayEntry] = sink.toolCalls;

  // The stream event and the toolCalls[] entry must agree on every field below.
  [streamEvent, arrayEntry].forEach((rec) => {
    assert.strictEqual(rec.tool, 'exec'); // legacy byte-identical
    assert.deepStrictEqual(rec.args, ['false']);
    assert.strictEqual(rec.ok, false); // legacy ok reflects error
    assert.strictEqual(rec.ms, 7);
    assert.strictEqual(rec.status, 'error'); // enriched status reflects error
    assert.strictEqual(rec.durationMs, 7); // mirrors ms
    assert.strictEqual(rec.category, 'shell');
    assert.strictEqual(rec.detail, null); // errors carry no detail
    assert.deepStrictEqual(rec.error, { message: 'exit 3', code: 3 });
  });
});
297
+
// §4.3 — web stays per-op: web_search + http_get emit N enriched {type:'tool'}
// events, NO collapsed web summary (the headless rail must not import the
// web-activity collapse). Driven through the sink so no real network is needed.
test('web ops emit N per-op enriched events (no collapse)', () => {
  const emitted = [];
  const sink = createHeadlessSink('stream-json', (o) => emitted.push(o));
  const { onToolEnd } = sink.callbacks;

  const searchMeta = {
    id: 1,
    call: ['web_search', 'cats'],
    attrs: { query: 'cats' },
    meta: null,
    error: null,
  };
  const fetchMeta = {
    id: 2,
    call: ['http_get', 'https://x.test'],
    attrs: { url: 'https://x.test' },
    meta: { status_code: 200 },
    error: null,
  };
  // Invoked through .call so `this` is still sink.callbacks, as in a method call.
  onToolEnd.call(sink.callbacks, 'web_search', 'results…', 4, searchMeta);
  onToolEnd.call(sink.callbacks, 'http_get', 'page body', 9, fetchMeta);

  const toolEvents = emitted.filter((e) => e.type === 'tool');
  assert.strictEqual(toolEvents.length, 2, 'two per-op tool events, not one summary');
  assert.strictEqual(sink.toolCalls.length, 2, 'two per-op entries in toolCalls[]');

  const [searchEvent, fetchEvent] = toolEvents;
  assert.strictEqual(searchEvent.tool, 'web_search');
  assert.strictEqual(fetchEvent.tool, 'http_get');
  assert.strictEqual(fetchEvent.category, 'net', 'http_get is category net');
  assert.strictEqual(fetchEvent.target, 'https://x.test');

  // No collapsed web-summary event leaked into the stream.
  const collapsed = emitted.some((e) => e.type === 'web' || e.type === 'web-summary');
  assert.ok(!collapsed, 'no collapsed web summary');
});
321
+
// §4.4 — abort / no-descriptor safety: an aborted op (meta.error present, no diff)
// emits a valid enriched event with status:error and never crashes; a totally
// missing meta still produces a valid event (buildToolOperation defaults every
// field, so it never throws) with the legacy fields intact — and if a descriptor
// genuinely can't be built the bare legacy-only rec is the floor.
test('aborted op is safe and enriched; missing meta still yields a valid event', () => {
  const emitted = [];
  const sink = createHeadlessSink('stream-json', (o) => emitted.push(o));
  const toolEvents = () => emitted.filter((e) => e.type === 'tool');

  // Aborted op (mirrors agent.js abort path: meta:null, error:{message:'aborted'}).
  const abortMeta = {
    id: 1,
    call: ['exec', 'sleep 100'],
    attrs: { command: 'sleep 100' },
    meta: null,
    error: { message: 'aborted' },
  };
  sink.callbacks.onToolEnd('exec', 'User interrupted…', 5, abortMeta);

  const [aborted] = toolEvents();
  assert.strictEqual(aborted.ok, false);
  assert.strictEqual(aborted.status, 'error');
  assert.strictEqual(aborted.category, 'shell');

  // No meta at all → still a valid event, no crash; legacy fields intact.
  sink.callbacks.onToolEnd('weird_tool', 'x', 1, null);

  const bare = toolEvents()[1];
  assert.strictEqual(bare.tool, 'weird_tool');
  assert.deepStrictEqual(bare.args, []);
  assert.strictEqual(bare.ok, true);
  assert.strictEqual(bare.ms, 1);
  assert.strictEqual(bare.type, 'tool');
});
@@ -0,0 +1,88 @@
1
+ 'use strict';
2
+
3
+ // Unit tests for the pure history helpers extracted in Task 1.5.
4
+
5
+ const { test } = require('node:test');
6
+ const assert = require('node:assert');
7
+
8
+ const { cleanOrphanedToolMessages, reconstructLoadedMessage } = require('../lib/commands/history-utils');
9
+
10
+ test('cleanOrphanedToolMessages keeps fully-paired tool calls/results', () => { // baseline: nothing is orphaned, so nothing may be dropped
11
+ const msgs = [
12
+ { role: 'user', content: 'hi' },
13
+ { role: 'assistant', content: '', tool_calls: [{ id: 'a', type: 'function', function: { name: 'read', arguments: '{}' } }] },
14
+ { role: 'tool', tool_call_id: 'a', content: 'result' }, // answers tool_call 'a' above
15
+ ];
16
+ const r = cleanOrphanedToolMessages(msgs);
17
+ assert.deepStrictEqual(r.messages, msgs); // output is deep-equal to the input
18
+ assert.strictEqual(r.droppedTool, 0);
19
+ assert.strictEqual(r.droppedAssistantCalls, 0);
20
+ assert.strictEqual(r.droppedAssistantMsgs, 0);
21
+ });
22
+
23
+ test('drops a tool result with no matching tool_call', () => { // a tool message whose id no assistant tool_call declares is an orphan
24
+ const msgs = [
25
+ { role: 'user', content: 'hi' },
26
+ { role: 'tool', tool_call_id: 'orphan', content: 'x' }, // no assistant message in this list declares id 'orphan'
27
+ ];
28
+ const r = cleanOrphanedToolMessages(msgs);
29
+ assert.deepStrictEqual(r.messages, [{ role: 'user', content: 'hi' }]); // only the user message survives
30
+ assert.strictEqual(r.droppedTool, 1); // the orphan is counted in droppedTool
31
+ });
32
+
33
+ test('drops a tool result with empty/missing tool_call_id', () => { // NOTE(review): only the empty-string id is exercised; a message with no tool_call_id key is not covered here
34
+ const r = cleanOrphanedToolMessages([{ role: 'tool', tool_call_id: '', content: 'x' }]);
35
+ assert.strictEqual(r.messages.length, 0); // the lone tool message is removed
36
+ assert.strictEqual(r.droppedTool, 1);
37
+ });
38
+
39
+ test('strips an unpaired tool_call but keeps the assistant message when it has content', () => { // non-empty content keeps the message alive
40
+ const msgs = [
41
+ { role: 'assistant', content: 'here you go', tool_calls: [{ id: 'unpaired', type: 'function', function: { name: 'x', arguments: '{}' } }] }, // no tool message answers 'unpaired'
42
+ ];
43
+ const r = cleanOrphanedToolMessages(msgs);
44
+ assert.strictEqual(r.messages.length, 1);
45
+ assert.strictEqual(r.messages[0].content, 'here you go');
46
+ assert.ok(!('tool_calls' in r.messages[0]), 'unpaired tool_calls removed'); // the key itself must be absent, not just an empty array
47
+ assert.strictEqual(r.droppedAssistantCalls, 1); // one call stripped
48
+ assert.strictEqual(r.droppedAssistantMsgs, 0); // but no message dropped
49
+ });
50
+
51
+ test('drops an assistant message that has only unpaired tool_calls and no content', () => { // empty content + no surviving calls leaves nothing worth keeping
52
+ const msgs = [
53
+ { role: 'assistant', content: '', tool_calls: [{ id: 'unpaired', type: 'function', function: { name: 'x', arguments: '{}' } }] }, // no tool message answers 'unpaired'
54
+ ];
55
+ const r = cleanOrphanedToolMessages(msgs);
56
+ assert.strictEqual(r.messages.length, 0); // the whole assistant message is removed
57
+ assert.strictEqual(r.droppedAssistantCalls, 1); // the stripped call is still counted
58
+ assert.strictEqual(r.droppedAssistantMsgs, 1); // and so is the dropped message
59
+ });
60
+
61
+ test('partially-paired tool_calls keep only the paired ids', () => { // two calls declared, only one answered
62
+ const msgs = [
63
+ { role: 'assistant', content: '', tool_calls: [
64
+ { id: 'a', type: 'function', function: { name: 'x', arguments: '{}' } }, // answered below
65
+ { id: 'b', type: 'function', function: { name: 'y', arguments: '{}' } }, // never answered
66
+ ] },
67
+ { role: 'tool', tool_call_id: 'a', content: 'ra' },
68
+ ];
69
+ const r = cleanOrphanedToolMessages(msgs);
70
+ assert.strictEqual(r.messages[0].tool_calls.length, 1); // assistant message survives with a single call
71
+ assert.strictEqual(r.messages[0].tool_calls[0].id, 'a');
72
+ assert.strictEqual(r.droppedAssistantCalls, 1); // 'b' is the one dropped call
73
+ });
74
+
75
+ test('reconstructLoadedMessage carries role/content and only present optional fields', () => {
76
+ assert.deepStrictEqual(reconstructLoadedMessage({ role: 'user', content: 'hi' }), { role: 'user', content: 'hi' }); // plain message: no optional keys are added
77
+ assert.deepStrictEqual(
78
+ reconstructLoadedMessage({ role: 'tool', content: 'r', tool_call_id: 'a' }),
79
+ { role: 'tool', content: 'r', tool_call_id: 'a' }, // a non-empty tool_call_id is carried over
80
+ );
81
+ // An empty-string tool_call_id and an empty tool_calls array are both omitted from the result.
82
+ assert.deepStrictEqual(
83
+ reconstructLoadedMessage({ role: 'tool', content: 'r', tool_call_id: '', tool_calls: [] }),
84
+ { role: 'tool', content: 'r' },
85
+ );
86
+ const withCalls = reconstructLoadedMessage({ role: 'assistant', content: '', tool_calls: [{ id: 'a' }] });
87
+ assert.deepStrictEqual(withCalls.tool_calls, [{ id: 'a' }]); // a non-empty tool_calls array is carried over
88
+ });